-
Notifications
You must be signed in to change notification settings - Fork 357
[ENG-9828] - Backfill CedarMetadataRecord from CollectionSubmission custom metadata #11740
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
adlius
merged 7 commits into
CenterForOpenScience:feature/es2-consolidation
from
Vlad0n20:feature/ENG-9828
Jun 16, 2026
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
8fd3986
Backfill CedarMetadataRecord from CollectionSubmission custom metadata
Vlad0n20 dc1bbae
fix comment
Vlad0n20 b90c03a
Remove TestCopyCollectionSubmissionMetadataToCedar tests
Vlad0n20 29e9b04
fix test
Vlad0n20 7e03a03
Restore and fix tests
Vlad0n20 2445143
Add command tests
Vlad0n20 f90c3f6
Fix @context building and add validation in sync_cedar_metadata
Vlad0n20 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
73 changes: 73 additions & 0 deletions
73
osf/management/commands/copy_collection_submission_metadata_to_cedar.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| import logging | ||
|
|
||
| from django.core.management.base import BaseCommand | ||
|
|
||
| from osf.models import CollectionSubmission | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def copy_collection_submission_metadata_to_cedar(dry_run=False, batch_size=100, provider_id=None):
    """Call ``sync_cedar_metadata()`` on submissions whose provider requires a template.

    Selects every ``CollectionSubmission`` whose collection's provider has a
    non-null ``required_metadata_template`` and syncs each one. An exception
    raised for one submission is logged (with traceback) and counted; the
    remaining submissions are still processed.

    :param dry_run: when true, only log which submissions would be synced and
        do not call ``sync_cedar_metadata()``.
    :param batch_size: passed as ``chunk_size`` to ``QuerySet.iterator``.
    :param provider_id: optional provider ``_id``; when truthy, only that
        provider's submissions are selected.
    """
    log_prefix = '[DRY RUN] ' if dry_run else ''

    qs = CollectionSubmission.objects.filter(
        collection__provider__required_metadata_template__isnull=False,
    ).select_related(
        'guid',
        'collection__provider__required_metadata_template',
    )

    if provider_id:
        qs = qs.filter(collection__provider___id=provider_id)

    total = qs.count()
    logger.info(f'{log_prefix}Found {total} collection submissions to process')

    processed = errors = 0
    for submission in qs.iterator(chunk_size=batch_size):
        if dry_run:
            logger.info(f'[DRY RUN] Would sync cedar metadata for submission {submission._id}')
            # Count dry-run items so the summary does not misleadingly report 0/total.
            processed += 1
            continue
        try:
            submission.sync_cedar_metadata()
        except Exception as e:
            # Best-effort backfill: record the failure with its traceback and keep going.
            logger.exception(f'Failed to sync cedar metadata for submission {submission._id}: {e}')
            errors += 1
        else:
            processed += 1

    verb = 'Would process' if dry_run else 'Processed'
    error_note = f', {errors} error(s)' if errors else ''
    logger.info(f'{log_prefix}Done. {verb} {processed}/{total} submissions{error_note}')
|
|
||
|
|
||
class Command(BaseCommand):
    """Management command entry point for ``copy_collection_submission_metadata_to_cedar``."""

    help = 'Copy CollectionSubmission custom metadata fields to CedarMetadataRecord for providers with a required cedar template.'

    def add_arguments(self, parser):
        """Register the ``--dry-run``, ``--batch-size`` and ``--provider`` options."""
        super().add_arguments(parser)
        # (flag, add_argument keyword arguments), registered in this order.
        option_specs = (
            ('--dry-run', {
                'action': 'store_true',
                'dest': 'dry_run',
                'help': 'Preview what would be synced without making any changes',
            }),
            ('--batch-size', {
                'type': int,
                'default': 100,
                'dest': 'batch_size',
                'help': 'Number of submissions to process per iteration (default: 100)',
            }),
            ('--provider', {
                'type': str,
                'dest': 'provider_id',
                'help': 'Optional collection provider _id to limit processing to a single provider',
            }),
        )
        for flag, spec in option_specs:
            parser.add_argument(flag, **spec)

    def handle(self, *args, **options):
        """Forward the parsed options to the module-level backfill function."""
        dry_run = options['dry_run']
        batch_size = options['batch_size']
        provider_id = options.get('provider_id')
        copy_collection_submission_metadata_to_cedar(
            dry_run=dry_run,
            batch_size=batch_size,
            provider_id=provider_id,
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
210 changes: 210 additions & 0 deletions
210
osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,210 @@ | ||
| import pytest | ||
| from faker import Faker | ||
| from unittest import mock | ||
|
|
||
| from django.core.management import call_command | ||
|
|
||
| from osf.management.commands.copy_collection_submission_metadata_to_cedar import ( | ||
| copy_collection_submission_metadata_to_cedar, | ||
| ) | ||
| from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate | ||
| from osf_tests.factories import ( | ||
| CollectionFactory, | ||
| CollectionProviderFactory, | ||
| NodeFactory, | ||
| ) | ||
| from tests.utils import capture_notifications | ||
|
|
||
| fake = Faker() | ||
|
|
||
|
|
||
def make_cedar_template():
    """Create and return an active CedarMetadataTemplate with a fake name and id and an empty template."""
    template_fields = {
        'schema_name': fake.bs(),
        'cedar_id': fake.md5(),
        'template_version': 1,
        'template': {},
        'active': True,
    }
    return CedarMetadataTemplate.objects.create(**template_fields)
|
|
||
|
|
||
def make_collection(provider):
    """Build a collection with CollectionFactory, assign ``provider`` to it, save it and return it."""
    new_collection = CollectionFactory()
    new_collection.provider = provider
    new_collection.save()
    return new_collection
|
|
||
|
|
||
def make_submission(collection, **fields):
    """Save and return a CollectionSubmission for a new public node in ``collection``.

    Extra keyword arguments are passed straight to the CollectionSubmission
    constructor. The save runs inside ``capture_notifications()``.
    """
    public_node = NodeFactory(is_public=True)
    node_guid = public_node.guids.first()
    new_submission = CollectionSubmission(
        guid=node_guid,
        collection=collection,
        creator=public_node.creator,
        **fields,
    )
    with capture_notifications():
        new_submission.save()
    return new_submission
|
|
||
|
|
||
@pytest.fixture()
def cedar_template():
    """Provide a template created by ``make_cedar_template``."""
    template = make_cedar_template()
    return template
|
|
||
|
|
||
@pytest.fixture()
def provider_with_template(cedar_template):
    """Provide a saved collection provider whose required metadata template is ``cedar_template``."""
    templated_provider = CollectionProviderFactory()
    templated_provider.required_metadata_template = cedar_template
    templated_provider.save()
    return templated_provider
|
|
||
|
|
||
@pytest.fixture()
def provider_without_template():
    """Provide a collection provider straight from the factory, with no template assigned here."""
    plain_provider = CollectionProviderFactory()
    return plain_provider
|
|
||
|
|
||
@pytest.mark.django_db
class TestCopyCollectionSubmissionMetadataToCedar:
    """Tests for the backfill function and its ``call_command`` interface."""

    def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, collected_type='software', status='active')

        copy_collection_submission_metadata_to_cedar()

        matching = CedarMetadataRecord.objects.filter(guid=sub.guid, template=cedar_template)
        assert matching.exists()

    def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, collected_type='dataset', status='', volume='')

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=sub.guid, template=cedar_template)
        # Empty-string fields are expected to be left out of the record.
        assert synced.metadata == {'collected_type': 'dataset'}

    def test_record_is_published(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, status='active')

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=sub.guid, template=cedar_template)
        assert synced.is_published is True

    def test_updates_existing_record(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, status='new')
        # Pre-existing, unpublished record with stale metadata.
        CedarMetadataRecord.objects.create(
            guid=sub.guid,
            template=cedar_template,
            metadata={'status': 'old'},
            is_published=False,
        )

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=sub.guid, template=cedar_template)
        assert synced.metadata == {'status': 'new'}
        assert synced.is_published is True

    def test_skips_submissions_without_required_template(self, provider_without_template):
        target_collection = make_collection(provider_without_template)
        sub = make_submission(target_collection, collected_type='software')

        copy_collection_submission_metadata_to_cedar()

        leftover = CedarMetadataRecord.objects.filter(guid=sub.guid)
        assert not leftover.exists()

    def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, collected_type='software')

        copy_collection_submission_metadata_to_cedar(dry_run=True)

        leftover = CedarMetadataRecord.objects.filter(guid=sub.guid)
        assert not leftover.exists()

    def test_provider_filter_processes_only_matching_provider(self, cedar_template):
        provider_a = CollectionProviderFactory()
        provider_a.required_metadata_template = cedar_template
        provider_a.save()

        provider_b = CollectionProviderFactory()
        provider_b.required_metadata_template = make_cedar_template()
        provider_b.save()

        collection_a = make_collection(provider_a)
        sub_a = make_submission(collection_a, collected_type='software')
        collection_b = make_collection(provider_b)
        sub_b = make_submission(collection_b, collected_type='dataset')

        copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id)

        records_a = CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template)
        records_b = CedarMetadataRecord.objects.filter(guid=sub_b.guid)
        assert records_a.exists()
        assert not records_b.exists()

    def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        make_submission(target_collection, collected_type='software')
        make_submission(target_collection, collected_type='dataset')

        # First call raises, second succeeds; both submissions must be attempted.
        outcomes = [Exception('simulated error'), None]
        with mock.patch.object(
            CollectionSubmission, 'sync_cedar_metadata', side_effect=outcomes,
        ) as patched_sync:
            copy_collection_submission_metadata_to_cedar()

        assert patched_sync.call_count == 2

    def test_call_command_interface(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, collected_type='software')

        call_command('copy_collection_submission_metadata_to_cedar')

        matching = CedarMetadataRecord.objects.filter(guid=sub.guid, template=cedar_template)
        assert matching.exists()

    def test_call_command_dry_run(self, provider_with_template, cedar_template):
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, collected_type='software')

        call_command('copy_collection_submission_metadata_to_cedar', dry_run=True)

        leftover = CedarMetadataRecord.objects.filter(guid=sub.guid)
        assert not leftover.exists()

    def test_all_cedar_fields_copied(self, provider_with_template, cedar_template):
        # Every field set on the submission is expected to appear in the record as-is.
        all_fields = {
            'collected_type': 'software',
            'status': 'active',
            'volume': '1',
            'issue': '2',
            'program_area': 'health',
            'school_type': 'university',
            'study_design': 'rct',
            'data_type': 'quantitative',
            'disease': 'cancer',
            'grade_levels': 'K-12',
        }
        target_collection = make_collection(provider_with_template)
        sub = make_submission(target_collection, **all_fields)

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=sub.guid, template=cedar_template)
        assert synced.metadata == all_fields
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.