From 8fd398613bd701cf4257a4e9c299a5ee8ff39df6 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Fri, 15 May 2026 17:08:13 +0300 Subject: [PATCH 1/7] Backfill CedarMetadataRecord from CollectionSubmission custom metadata --- ...collection_submission_metadata_to_cedar.py | 73 ++++++ ...collection_submission_metadata_to_cedar.py | 220 ++++++++++++++++++ 2 files changed, 293 insertions(+) create mode 100644 osf/management/commands/copy_collection_submission_metadata_to_cedar.py create mode 100644 osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py diff --git a/osf/management/commands/copy_collection_submission_metadata_to_cedar.py b/osf/management/commands/copy_collection_submission_metadata_to_cedar.py new file mode 100644 index 00000000000..255228b4c96 --- /dev/null +++ b/osf/management/commands/copy_collection_submission_metadata_to_cedar.py @@ -0,0 +1,73 @@ +import logging + +from django.core.management.base import BaseCommand + +from osf.models import CollectionSubmission + +logger = logging.getLogger(__name__) + + +def copy_collection_submission_metadata_to_cedar(dry_run=False, batch_size=100, provider_id=None): + qs = CollectionSubmission.objects.filter( + collection__provider__required_metadata_template__isnull=False, + ).select_related( + 'guid', + 'collection__provider__required_metadata_template', + ) + + if provider_id: + qs = qs.filter(collection__provider___id=provider_id) + + total = qs.count() + logger.info(f'{"[DRY RUN] " if dry_run else ""}Found {total} collection submissions to process') + + processed = errors = 0 + for submission in qs.iterator(chunk_size=batch_size): + if dry_run: + logger.info(f'[DRY RUN] Would sync cedar metadata for submission {submission._id}') + continue + try: + submission.sync_cedar_metadata() + processed += 1 + except Exception as e: + logger.error(f'Failed to sync cedar metadata for submission {submission._id}: {e}') + errors += 1 + + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Done. Processed {processed}/{total} submissions' + f'{f", {errors} error(s)" if errors else ""}' + ) + + +class Command(BaseCommand): + help = 'Copy CollectionSubmission custom metadata fields to CedarMetadataRecord for providers with a required cedar template.' + + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument( + '--dry-run', + action='store_true', + dest='dry_run', + help='Preview what would be synced without making any changes', + ) + parser.add_argument( + '--batch-size', + type=int, + default=100, + dest='batch_size', + help='Number of submissions to process per iteration (default: 100)', + ) + parser.add_argument( + '--provider', + type=str, + dest='provider_id', + help='Optional collection provider _id to limit processing to a single provider', + ) + + def handle(self, *args, **options): + copy_collection_submission_metadata_to_cedar( + dry_run=options['dry_run'], + batch_size=options['batch_size'], + provider_id=options.get('provider_id'), + ) diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py new file mode 100644 index 00000000000..9c800b34c8c --- /dev/null +++ b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py @@ -0,0 +1,220 @@ +import pytest +from faker import Faker +from unittest import mock + +from django.core.management import call_command + +from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate +from osf.management.commands.copy_collection_submission_metadata_to_cedar import ( + copy_collection_submission_metadata_to_cedar, +) +from osf_tests.factories import ( + CollectionFactory, + CollectionProviderFactory, + NodeFactory, +) +from tests.utils import capture_notifications + +fake = Faker() + + +def make_cedar_template(): + return CedarMetadataTemplate.objects.create( + schema_name=fake.bs(), + cedar_id=fake.md5(), + template_version=1, + template={}, + active=True, + ) + + +def make_collection(provider): + collection = CollectionFactory() + collection.provider = provider + collection.save() + return collection + + +def make_submission(collection, **fields): + node = NodeFactory(is_public=True) + submission = CollectionSubmission( + guid=node.guids.first(), + collection=collection, + creator=node.creator, + **fields, + ) + with capture_notifications(): + submission.save() + return submission + + +@pytest.fixture() +def cedar_template(): + return make_cedar_template() + + +@pytest.fixture() +def provider_with_template(cedar_template): + provider = CollectionProviderFactory() + provider.required_metadata_template = cedar_template + provider.save() + return provider + + +@pytest.fixture() +def provider_without_template(): + return CollectionProviderFactory() + + +@pytest.mark.django_db +class TestCopyCollectionSubmissionMetadataToCedar: + + def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software', status='active') + + copy_collection_submission_metadata_to_cedar() + + assert CedarMetadataRecord.objects.filter( + guid=submission.guid, + template=cedar_template, + ).exists() + + def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='dataset', status='', volume='') + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == {'collected_type': 'dataset'} + + def test_record_is_published(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, status='active') + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.is_published is True + + def test_updates_existing_record(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, status='new') + CedarMetadataRecord.objects.create( + guid=submission.guid, + template=cedar_template, + metadata={'status': 'old'}, + is_published=False, + ) + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == {'status': 'new'} + assert record.is_published is True + + def test_skips_submissions_without_required_template(self, provider_without_template): + collection = make_collection(provider_without_template) + submission = make_submission(collection, collected_type='software') + + copy_collection_submission_metadata_to_cedar() + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + copy_collection_submission_metadata_to_cedar(dry_run=True) + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_provider_filter_processes_only_matching_provider(self, cedar_template): + provider_a = CollectionProviderFactory() + provider_a.required_metadata_template = cedar_template + provider_a.save() + + provider_b = CollectionProviderFactory() + provider_b.required_metadata_template = make_cedar_template() + provider_b.save() + + sub_a = make_submission(make_collection(provider_a), collected_type='software') + sub_b = make_submission(make_collection(provider_b), collected_type='dataset') + + copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id) + + assert CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template).exists() + assert not CedarMetadataRecord.objects.filter(guid=sub_b.guid).exists() + + def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + make_submission(collection, collected_type='software') + make_submission(collection, collected_type='dataset') + + call_count = 0 + original_sync = CollectionSubmission.sync_cedar_metadata + + def sync_side_effect(self): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise Exception('simulated error') + original_sync(self) + + with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata', sync_side_effect): + copy_collection_submission_metadata_to_cedar() + + assert call_count == 2 + assert CedarMetadataRecord.objects.filter(template=cedar_template).count() == 1 + + def test_call_command_interface(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + call_command('copy_collection_submission_metadata_to_cedar') + + assert CedarMetadataRecord.objects.filter( + guid=submission.guid, + template=cedar_template, + ).exists() + + def test_call_command_dry_run(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + call_command('copy_collection_submission_metadata_to_cedar', '--dry-run') + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_all_cedar_fields_copied(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission( + collection, + collected_type='software', + status='active', + volume='1', + issue='2', + program_area='health', + school_type='university', + study_design='rct', + data_type='quantitative', + disease='cancer', + grade_levels='K-12', + ) + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == { + 'collected_type': 'software', + 'status': 'active', + 'volume': '1', + 'issue': '2', + 'program_area': 'health', + 'school_type': 'university', + 'study_design': 'rct', + 'data_type': 'quantitative', + 'disease': 'cancer', + 'grade_levels': 'K-12', + } From dc1bbae75607a177dcd6ce8adfc812dad1067b22 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Mon, 25 May 2026 14:40:16 +0300 Subject: [PATCH 2/7] fix comment --- .../test_copy_collection_submission_metadata_to_cedar.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py index 9c800b34c8c..1f92df1e6da 100644 --- a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py +++ b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py @@ -87,7 +87,7 @@ def test_record_contains_non_empty_fields_only(self, provider_with_template, ced copy_collection_submission_metadata_to_cedar() record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'collected_type': 'dataset'} + assert record.metadata == {'collected_type': 'dataset', '@context': cedar_template.cedar_id} def test_record_is_published(self, provider_with_template, cedar_template): collection = make_collection(provider_with_template) @@ -111,7 +111,7 @@ def test_updates_existing_record(self, provider_with_template, cedar_template): copy_collection_submission_metadata_to_cedar() record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'status': 'new'} + assert record.metadata == {'status': 'new', '@context': cedar_template.cedar_id} assert record.is_published is True def test_skips_submissions_without_required_template(self, provider_without_template): @@ -217,4 +217,5 @@ def test_all_cedar_fields_copied(self, provider_with_template, cedar_template): 'data_type': 'quantitative', 'disease': 'cancer', 'grade_levels': 'K-12', + '@context': cedar_template.cedar_id, } From b90c03afa0fa7c2291ade39ae644cad28d6f7481 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Tue, 9 Jun 2026 17:20:30 +0200 Subject: [PATCH 3/7] Remove TestCopyCollectionSubmissionMetadataToCedar tests --- ...collection_submission_metadata_to_cedar.py | 221 ------------------ 1 file changed, 221 deletions(-) delete mode 100644 osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py deleted file mode 100644 index 1f92df1e6da..00000000000 --- a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py +++ /dev/null @@ -1,221 +0,0 @@ -import pytest -from faker import Faker -from unittest import mock - -from django.core.management import call_command - -from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate -from osf.management.commands.copy_collection_submission_metadata_to_cedar import ( - copy_collection_submission_metadata_to_cedar, -) -from osf_tests.factories import ( - CollectionFactory, - CollectionProviderFactory, - NodeFactory, -) -from tests.utils import capture_notifications - -fake = Faker() - - -def make_cedar_template(): - return CedarMetadataTemplate.objects.create( - schema_name=fake.bs(), - cedar_id=fake.md5(), - template_version=1, - template={}, - active=True, - ) - - -def make_collection(provider): - collection = CollectionFactory() - collection.provider = provider - collection.save() - return collection - - -def make_submission(collection, **fields): - node = NodeFactory(is_public=True) - submission = CollectionSubmission( - guid=node.guids.first(), - collection=collection, - creator=node.creator, - **fields, - ) - with capture_notifications(): - submission.save() - return submission - - -@pytest.fixture() -def cedar_template(): - return make_cedar_template() - - -@pytest.fixture() -def provider_with_template(cedar_template): - provider = CollectionProviderFactory() - provider.required_metadata_template = cedar_template - provider.save() - return provider - - -@pytest.fixture() -def provider_without_template(): - return CollectionProviderFactory() - - -@pytest.mark.django_db -class TestCopyCollectionSubmissionMetadataToCedar: - - def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, collected_type='software', status='active') - - copy_collection_submission_metadata_to_cedar() - - assert CedarMetadataRecord.objects.filter( - guid=submission.guid, - template=cedar_template, - ).exists() - - def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, collected_type='dataset', status='', volume='') - - copy_collection_submission_metadata_to_cedar() - - record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'collected_type': 'dataset', '@context': cedar_template.cedar_id} - - def test_record_is_published(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, status='active') - - copy_collection_submission_metadata_to_cedar() - - record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.is_published is True - - def test_updates_existing_record(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, status='new') - CedarMetadataRecord.objects.create( - guid=submission.guid, - template=cedar_template, - metadata={'status': 'old'}, - is_published=False, - ) - - copy_collection_submission_metadata_to_cedar() - - record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'status': 'new', '@context': cedar_template.cedar_id} - assert record.is_published is True - - def test_skips_submissions_without_required_template(self, provider_without_template): - collection = make_collection(provider_without_template) - submission = make_submission(collection, collected_type='software') - - copy_collection_submission_metadata_to_cedar() - - assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() - - def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, collected_type='software') - - copy_collection_submission_metadata_to_cedar(dry_run=True) - - assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() - - def test_provider_filter_processes_only_matching_provider(self, cedar_template): - provider_a = CollectionProviderFactory() - provider_a.required_metadata_template = cedar_template - provider_a.save() - - provider_b = CollectionProviderFactory() - provider_b.required_metadata_template = make_cedar_template() - provider_b.save() - - sub_a = make_submission(make_collection(provider_a), collected_type='software') - sub_b = make_submission(make_collection(provider_b), collected_type='dataset') - - copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id) - - assert CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template).exists() - assert not CedarMetadataRecord.objects.filter(guid=sub_b.guid).exists() - - def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - make_submission(collection, collected_type='software') - make_submission(collection, collected_type='dataset') - - call_count = 0 - original_sync = CollectionSubmission.sync_cedar_metadata - - def sync_side_effect(self): - nonlocal call_count - call_count += 1 - if call_count == 1: - raise Exception('simulated error') - original_sync(self) - - with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata', sync_side_effect): - copy_collection_submission_metadata_to_cedar() - - assert call_count == 2 - assert CedarMetadataRecord.objects.filter(template=cedar_template).count() == 1 - - def test_call_command_interface(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, collected_type='software') - - call_command('copy_collection_submission_metadata_to_cedar') - - assert CedarMetadataRecord.objects.filter( - guid=submission.guid, - template=cedar_template, - ).exists() - - def test_call_command_dry_run(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission(collection, collected_type='software') - - call_command('copy_collection_submission_metadata_to_cedar', '--dry-run') - - assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() - - def test_all_cedar_fields_copied(self, provider_with_template, cedar_template): - collection = make_collection(provider_with_template) - submission = make_submission( - collection, - collected_type='software', - status='active', - volume='1', - issue='2', - program_area='health', - school_type='university', - study_design='rct', - data_type='quantitative', - disease='cancer', - grade_levels='K-12', - ) - - copy_collection_submission_metadata_to_cedar() - - record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == { - 'collected_type': 'software', - 'status': 'active', - 'volume': '1', - 'issue': '2', - 'program_area': 'health', - 'school_type': 'university', - 'study_design': 'rct', - 'data_type': 'quantitative', - 'disease': 'cancer', - 'grade_levels': 'K-12', - '@context': cedar_template.cedar_id, - } From 29e9b043185bd17f0f7002b06df5aef30014c02f Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Tue, 9 Jun 2026 19:19:04 +0200 Subject: [PATCH 4/7] fix test --- osf_tests/metadata/test_serialized_metadata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py index d3b38b540df..dc5a847c2e1 100644 --- a/osf_tests/metadata/test_serialized_metadata.py +++ b/osf_tests/metadata/test_serialized_metadata.py @@ -11,6 +11,7 @@ from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth from osf.models.licenses import NodeLicense +from osf.models.nodelog import NodeLog from api_tests.utils import create_test_file from osf_tests import factories from osf_tests.metadata._utils import assert_equivalent_turtle @@ -204,12 +205,14 @@ def setUp(self): super().setUp() # patch auto-generated fields into predictable values osfguid_sequence = OsfguidSequence('wibble') + _nodelog_date_field = NodeLog._meta.get_field('date') for patcher in ( mock.patch('osf.models.base.generate_guid', new=osfguid_sequence), mock.patch('osf.models.base.Guid.objects.get_or_create', new=osfguid_sequence.get_or_create), mock.patch('django.utils.timezone.now', new=forever_now), mock.patch('osf.models.mixins.timezone.now', new=forever_now), mock.patch('osf.models.nodelog.timezone.now', new=forever_now), + mock.patch.dict(_nodelog_date_field.__dict__, {'_get_default': forever_now}), mock.patch('osf.models.metaschema.RegistrationSchema.absolute_api_v2_url', new='http://fake.example/schema/for/test'), mock.patch('osf.models.node.Node.get_verified_links', return_value=[ {'target_url': 'https://foo.bar', 'resource_type': 'Other'} From 7e03a0307dc5f0182fff432754d58a8a725baf61 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Wed, 10 Jun 2026 19:00:11 +0200 Subject: [PATCH 5/7] Restore and fix tests --- osf/models/collection_submission.py | 21 ++++++ osf_tests/test_cedar_metadata_record.py | 93 +++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 osf_tests/test_cedar_metadata_record.py diff --git a/osf/models/collection_submission.py b/osf/models/collection_submission.py index 86f52ebdce6..86fa3e98f78 100644 --- a/osf/models/collection_submission.py +++ b/osf/models/collection_submission.py @@ -474,6 +474,27 @@ def remove_from_index(self): logger.exception(e) sentry.log_exception(e) + CEDAR_METADATA_FIELDS = ( + 'collected_type', 'status', 'volume', 'issue', + 'program_area', 'school_type', 'study_design', + 'data_type', 'disease', 'grade_levels', + ) + + def sync_cedar_metadata(self): + from osf.models.cedar_metadata import CedarMetadataRecord + template = self.collection.provider.required_metadata_template + metadata = { + field: getattr(self, field) + for field in self.CEDAR_METADATA_FIELDS + if getattr(self, field) + } + metadata['@context'] = template.cedar_id + CedarMetadataRecord.objects.update_or_create( + guid=self.guid, + template=template, + defaults={'metadata': metadata, 'is_published': True}, + ) + def save(self, *args, **kwargs): ret = super().save(*args, **kwargs) self.update_search() diff --git a/osf_tests/test_cedar_metadata_record.py b/osf_tests/test_cedar_metadata_record.py new file mode 100644 index 00000000000..739c2afb762 --- /dev/null +++ b/osf_tests/test_cedar_metadata_record.py @@ -0,0 +1,93 @@ +import pytest +from faker import Faker +from django.core.exceptions import ValidationError + +from osf.models import CedarMetadataRecord, CedarMetadataTemplate +from osf_tests.factories import AuthUserFactory, ProjectFactory + +fake = Faker() + +CEDAR_SCHEMA = { + '$schema': 'http://json-schema.org/draft-07/schema#', + 'type': 'object', + 'properties': { + '@context': {'type': 'object'}, + 'School Type': { + 'type': 'object', + 'properties': {'@value': {'type': 'string'}}, + }, + }, + 'required': ['@id', 'pav:createdOn', 'pav:createdBy', 'pav:lastUpdatedOn', 'oslc:modifiedBy', 'School Type'], + 'additionalProperties': True, +} + + +@pytest.fixture() +def user(): + return AuthUserFactory() + + +@pytest.fixture() +def node(user): + return ProjectFactory(creator=user) + + +@pytest.fixture() +def cedar_template(): + return CedarMetadataTemplate.objects.create( + schema_name=fake.bs(), + cedar_id=fake.md5(), + template_version=1, + template=CEDAR_SCHEMA, + active=True, + ) + + +@pytest.mark.django_db +class TestCedarMetadataRecordClean: + + def _make_record(self, node, cedar_template, metadata, is_published=True): + record = CedarMetadataRecord( + guid=node.guids.first(), + template=cedar_template, + metadata=metadata, + is_published=is_published, + ) + return record + + def test_provenance_fields_stripped_from_required(self, node, cedar_template): + record = self._make_record(node, cedar_template, { + 'School Type': {'@value': 'High School'}, + }) + record.clean() + + def test_empty_dict_properties_stripped_from_metadata(self, node, cedar_template): + record = self._make_record(node, cedar_template, { + 'School Type': {'@value': 'High School'}, + '3de6ff2c-555b-44d4-84b6-3862188d29c9': {}, + }) + record.clean() + + def test_invalid_metadata_raises_validation_error(self, node, cedar_template): + record = self._make_record(node, cedar_template, { + 'School Type': 'not-an-object', + }) + with pytest.raises(ValidationError, match='does not validate against template'): + record.clean() + + def test_missing_non_provenance_required_field_raises(self, node, cedar_template): + record = self._make_record(node, cedar_template, {}) + with pytest.raises(ValidationError, match='does not validate against template'): + record.clean() + + def test_draft_record_skips_validation(self, node, cedar_template): + record = self._make_record(node, cedar_template, {}, is_published=False) + record.clean() + + def test_template_required_list_not_mutated(self, node, cedar_template): + original_required = list(cedar_template.template['required']) + record = self._make_record(node, cedar_template, { + 'School Type': {'@value': 'High School'}, + }) + record.clean() + assert cedar_template.template['required'] == original_required From 24451437f7f295a9b640972213caf819dba9ef35 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Fri, 12 Jun 2026 18:08:08 +0200 Subject: [PATCH 6/7] Add command tests --- ...collection_submission_metadata_to_cedar.py | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py new file mode 100644 index 00000000000..abfd0a0f981 --- /dev/null +++ b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py @@ -0,0 +1,211 @@ +import pytest +from faker import Faker +from unittest import mock + +from django.core.management import call_command + +from osf.management.commands.copy_collection_submission_metadata_to_cedar import ( + copy_collection_submission_metadata_to_cedar, +) +from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate +from osf_tests.factories import ( + CollectionFactory, + CollectionProviderFactory, + NodeFactory, +) +from tests.utils import capture_notifications + +fake = Faker() + + +def make_cedar_template(): + return CedarMetadataTemplate.objects.create( + schema_name=fake.bs(), + cedar_id=fake.md5(), + template_version=1, + template={}, + active=True, + ) + + +def make_collection(provider): + collection = CollectionFactory() + collection.provider = provider + collection.save() + return collection + + +def make_submission(collection, **fields): + node = NodeFactory(is_public=True) + submission = CollectionSubmission( + guid=node.guids.first(), + collection=collection, + creator=node.creator, + **fields, + ) + with capture_notifications(): + submission.save() + return submission + + +@pytest.fixture() +def cedar_template(): + return make_cedar_template() + + +@pytest.fixture() +def provider_with_template(cedar_template): + provider = CollectionProviderFactory() + provider.required_metadata_template = cedar_template + provider.save() + return provider + + +@pytest.fixture() +def provider_without_template(): + return CollectionProviderFactory() + + +@pytest.mark.django_db +class TestCopyCollectionSubmissionMetadataToCedar: + + def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software', status='active') + + copy_collection_submission_metadata_to_cedar() + + assert CedarMetadataRecord.objects.filter( + guid=submission.guid, + template=cedar_template, + ).exists() + + def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='dataset', status='', volume='') + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == {'collected_type': 'dataset', '@context': cedar_template.cedar_id} + + def test_record_is_published(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, status='active') + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.is_published is True + + def test_updates_existing_record(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, status='new') + CedarMetadataRecord.objects.create( + guid=submission.guid, + template=cedar_template, + metadata={'status': 'old'}, + is_published=False, + ) + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == {'status': 'new', '@context': cedar_template.cedar_id} + assert record.is_published is True + + def test_skips_submissions_without_required_template(self, provider_without_template): + collection = make_collection(provider_without_template) + submission = make_submission(collection, collected_type='software') + + copy_collection_submission_metadata_to_cedar() + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + copy_collection_submission_metadata_to_cedar(dry_run=True) + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_provider_filter_processes_only_matching_provider(self, cedar_template): + provider_a = CollectionProviderFactory() + provider_a.required_metadata_template = cedar_template + provider_a.save() + + provider_b = CollectionProviderFactory() + provider_b.required_metadata_template = make_cedar_template() + provider_b.save() + + sub_a = make_submission(make_collection(provider_a), collected_type='software') + sub_b = make_submission(make_collection(provider_b), collected_type='dataset') + + copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id) + + assert CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template).exists() + assert not CedarMetadataRecord.objects.filter(guid=sub_b.guid).exists() + + def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + make_submission(collection, collected_type='software') + make_submission(collection, collected_type='dataset') + + with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata') as mock_sync: + mock_sync.side_effect = [Exception('simulated error'), None] + copy_collection_submission_metadata_to_cedar() + + assert mock_sync.call_count == 2 + + def test_call_command_interface(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + call_command('copy_collection_submission_metadata_to_cedar') + + assert CedarMetadataRecord.objects.filter( + guid=submission.guid, + template=cedar_template, + ).exists() + + def test_call_command_dry_run(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission(collection, collected_type='software') + + call_command('copy_collection_submission_metadata_to_cedar', dry_run=True) + + assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists() + + def test_all_cedar_fields_copied(self, provider_with_template, cedar_template): + collection = make_collection(provider_with_template) + submission = make_submission( + collection, + collected_type='software', + status='active', + volume='1', + issue='2', + program_area='health', + school_type='university', + study_design='rct', + data_type='quantitative', + disease='cancer', + grade_levels='K-12', + ) + + copy_collection_submission_metadata_to_cedar() + + record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) + assert record.metadata == { + 'collected_type': 'software', + 'status': 'active', + 'volume': '1', + 'issue': '2', + 'program_area': 'health', + 'school_type': 'university', + 'study_design': 'rct', + 'data_type': 'quantitative', + 'disease': 'cancer', + 'grade_levels': 'K-12', + '@context': cedar_template.cedar_id, + } From f90c3f6d53d28251d7a22a037308e6447b194c9f Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Fri, 12 Jun 2026 18:44:22 +0200 Subject: [PATCH 7/7] Fix @context building and add validation in sync_cedar_metadata --- osf/models/collection_submission.py | 38 +++++++++++++++++-- ...collection_submission_metadata_to_cedar.py | 5 +-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/osf/models/collection_submission.py b/osf/models/collection_submission.py index 86fa3e98f78..d787f83d29d 100644 --- a/osf/models/collection_submission.py +++ b/osf/models/collection_submission.py @@ -22,6 +22,34 @@ logger = logging.getLogger(__name__) + +def _cedar_record_context(cedar_template_jsonschema): + try: + props = cedar_template_jsonschema['properties']['@context']['properties'] + except (KeyError, TypeError): + return None + return _cedar_record_context_obj(props) + + +def _cedar_record_context_obj(cedar_context_properties): + return { + prop: _cedar_record_context_val(prop_schema) + for prop, prop_schema in cedar_context_properties.items() + } + + +def _cedar_record_context_val(cedar_context_property_schema): + try: + return cedar_context_property_schema['enum'][0] + except (LookupError, TypeError): + pass + if cedar_context_property_schema.get('type') == 'object': + return _cedar_record_context_obj( + cedar_context_property_schema.get('properties', {}) + ) + raise ValueError(cedar_context_property_schema) + + class CollectionSubmission(TaxonomizableMixin, BaseModel): primary_identifier_name = 'guid___id' @@ -488,12 +516,16 @@ def sync_cedar_metadata(self): for field in self.CEDAR_METADATA_FIELDS if getattr(self, field) } - metadata['@context'] = template.cedar_id - CedarMetadataRecord.objects.update_or_create( + context = _cedar_record_context(template.template) + if context is not None: + metadata['@context'] = context + record, _ = CedarMetadataRecord.objects.get_or_create( guid=self.guid, template=template, - defaults={'metadata': metadata, 'is_published': True}, ) + record.metadata = metadata + record.is_published = True + record.save() def save(self, *args, **kwargs): ret = super().save(*args, **kwargs) diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py index abfd0a0f981..b9a5bfe4caf 100644 --- a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py +++ b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py @@ -87,7 +87,7 @@ def test_record_contains_non_empty_fields_only(self, provider_with_template, ced copy_collection_submission_metadata_to_cedar() record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'collected_type': 'dataset', '@context': cedar_template.cedar_id} + assert record.metadata == {'collected_type': 'dataset'} def test_record_is_published(self, provider_with_template, cedar_template): collection = make_collection(provider_with_template) @@ -111,7 +111,7 @@ def test_updates_existing_record(self, provider_with_template, cedar_template): copy_collection_submission_metadata_to_cedar() record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template) - assert record.metadata == {'status': 'new', '@context': cedar_template.cedar_id} + assert record.metadata == {'status': 'new'} assert record.is_published is True def test_skips_submissions_without_required_template(self, provider_without_template): @@ -207,5 +207,4 @@ def test_all_cedar_fields_copied(self, provider_with_template, cedar_template): 'data_type': 'quantitative', 'disease': 'cancer', 'grade_levels': 'K-12', - '@context': cedar_template.cedar_id, }