diff --git a/osf/management/commands/copy_collection_submission_metadata_to_cedar.py b/osf/management/commands/copy_collection_submission_metadata_to_cedar.py
new file mode 100644
index 00000000000..255228b4c96
--- /dev/null
+++ b/osf/management/commands/copy_collection_submission_metadata_to_cedar.py
@@ -0,0 +1,88 @@
+import logging
+
+from django.core.management.base import BaseCommand
+
+from osf.models import CollectionSubmission
+
+logger = logging.getLogger(__name__)
+
+
+def copy_collection_submission_metadata_to_cedar(dry_run=False, batch_size=100, provider_id=None):
+    """Sync cedar metadata records for collection submissions.
+
+    Selects every CollectionSubmission whose collection provider has a
+    ``required_metadata_template`` and calls ``sync_cedar_metadata()`` on it.
+    A failure on one submission is logged and counted; the run continues.
+
+    :param dry_run: if true, only log which submissions would be synced
+    :param batch_size: ``chunk_size`` given to ``QuerySet.iterator()``
+    :param provider_id: optional collection provider ``_id`` to limit the run to
+    """
+    qs = CollectionSubmission.objects.filter(
+        collection__provider__required_metadata_template__isnull=False,
+    ).select_related(
+        'guid',
+        'collection__provider__required_metadata_template',
+    )
+
+    if provider_id:
+        # not a typo: the lookup is `collection__provider` + `__` + `_id`
+        qs = qs.filter(collection__provider___id=provider_id)
+
+    total = qs.count()
+    logger.info(f'{"[DRY RUN] " if dry_run else ""}Found {total} collection submissions to process')
+
+    processed = errors = 0
+    for submission in qs.iterator(chunk_size=batch_size):
+        if dry_run:
+            logger.info(f'[DRY RUN] Would sync cedar metadata for submission {submission._id}')
+            continue
+        try:
+            submission.sync_cedar_metadata()
+        except Exception as e:
+            # deliberate catch-all so one bad submission cannot stop the run; keep the traceback
+            logger.exception(f'Failed to sync cedar metadata for submission {submission._id}: {e}')
+            errors += 1
+        else:
+            processed += 1
+
+    logger.info(
+        f'{"[DRY RUN] " if dry_run else ""}'
+        f'Done. Processed {processed}/{total} submissions'
+        f'{f", {errors} error(s)" if errors else ""}'
+    )
+
+
+class Command(BaseCommand):
+    help = 'Copy CollectionSubmission custom metadata fields to CedarMetadataRecord for providers with a required cedar template.'
+
+    def add_arguments(self, parser):
+        """Register the ``--dry-run``, ``--batch-size`` and ``--provider`` options."""
+        super().add_arguments(parser)
+        parser.add_argument(
+            '--dry-run',
+            action='store_true',
+            dest='dry_run',
+            help='Preview what would be synced without making any changes',
+        )
+        parser.add_argument(
+            '--batch-size',
+            type=int,
+            default=100,
+            dest='batch_size',
+            help='Number of submissions to process per iteration (default: 100)',
+        )
+        parser.add_argument(
+            '--provider',
+            type=str,
+            dest='provider_id',
+            help='Optional collection provider _id to limit processing to a single provider',
+        )
+
+    def handle(self, *args, **options):
+        """Run the sync with the parsed command-line options."""
+        copy_collection_submission_metadata_to_cedar(
+            dry_run=options['dry_run'],
+            batch_size=options['batch_size'],
+            provider_id=options.get('provider_id'),
+        )
diff --git a/osf/models/collection_submission.py b/osf/models/collection_submission.py
index 86f52ebdce6..d787f83d29d 100644
--- a/osf/models/collection_submission.py
+++ b/osf/models/collection_submission.py
@@ -22,6 +22,48 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _cedar_record_context(cedar_template_jsonschema):
+    """Build the ``@context`` value for a record from a cedar template's jsonschema.
+
+    Returns None when the schema has no ``properties -> @context -> properties``.
+    """
+    try:
+        props = cedar_template_jsonschema['properties']['@context']['properties']
+    except (KeyError, TypeError):
+        return None
+    return _cedar_record_context_obj(props)
+
+
+def _cedar_record_context_obj(cedar_context_properties):
+    """Map each property name to the context value derived from its schema."""
+    return {
+        prop: _cedar_record_context_val(prop_schema)
+        for prop, prop_schema in cedar_context_properties.items()
+    }
+
+
+def _cedar_record_context_val(cedar_context_property_schema):
+    """Return the context value for one property schema.
+
+    Uses the first ``enum`` entry if there is one, recurses into ``object``
+    schemas, and raises ValueError for anything else.
+    """
+    try:
+        return cedar_context_property_schema['enum'][0]
+    except (LookupError, TypeError):
+        pass
+    # a non-dict schema has no `.get`; let it reach ValueError instead of AttributeError
+    if (
+        isinstance(cedar_context_property_schema, dict)
+        and cedar_context_property_schema.get('type') == 'object'
+    ):
+        return _cedar_record_context_obj(
+            cedar_context_property_schema.get('properties', {})
+        )
+    raise ValueError(cedar_context_property_schema)
+
+
 class CollectionSubmission(TaxonomizableMixin, BaseModel):
     primary_identifier_name = 'guid___id'
 
@@ -474,6 +516,38 @@ def remove_from_index(self):
             logger.exception(e)
             sentry.log_exception(e)
 
+    # CollectionSubmission fields copied into the cedar record by sync_cedar_metadata()
+    CEDAR_METADATA_FIELDS = (
+        'collected_type', 'status', 'volume', 'issue',
+        'program_area', 'school_type', 'study_design',
+        'data_type', 'disease', 'grade_levels',
+    )
+
+    def sync_cedar_metadata(self):
+        """Copy this submission's non-empty metadata fields to its cedar record.
+
+        Gets or creates the CedarMetadataRecord for this guid and the collection
+        provider's ``required_metadata_template``, replaces its metadata and
+        marks it published.
+        """
+        from osf.models.cedar_metadata import CedarMetadataRecord
+        template = self.collection.provider.required_metadata_template
+        metadata = {
+            field: getattr(self, field)
+            for field in self.CEDAR_METADATA_FIELDS
+            if getattr(self, field)
+        }
+        context = _cedar_record_context(template.template)
+        if context is not None:
+            metadata['@context'] = context
+        record, _ = CedarMetadataRecord.objects.get_or_create(
+            guid=self.guid,
+            template=template,
+        )
+        record.metadata = metadata
+        record.is_published = True
+        record.save()
+
     def save(self, *args, **kwargs):
         ret = super().save(*args, **kwargs)
         self.update_search()
diff --git a/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py
new file mode 100644
index 00000000000..b9a5bfe4caf
--- /dev/null
+++ b/osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py
@@ -0,0 +1,210 @@
+import pytest
+from faker import Faker
+from unittest import mock
+
+from django.core.management import call_command
+
+from osf.management.commands.copy_collection_submission_metadata_to_cedar import (
+    copy_collection_submission_metadata_to_cedar,
+)
+from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate
+from osf_tests.factories import (
+    CollectionFactory,
+    CollectionProviderFactory,
+    NodeFactory,
+)
+from tests.utils import capture_notifications
+
+fake = Faker()
+
+
+def make_cedar_template():
+    return CedarMetadataTemplate.objects.create(
+        schema_name=fake.bs(),
+        cedar_id=fake.md5(),
+        template_version=1,
+        template={},
+        active=True,
+    )
+
+
+def make_collection(provider):
+    collection = CollectionFactory()
+    collection.provider = provider
+    collection.save()
+    return collection
+
+
+def make_submission(collection, **fields):
+    node = NodeFactory(is_public=True)
+    submission = CollectionSubmission(
+        guid=node.guids.first(),
+        collection=collection,
+        creator=node.creator,
+        **fields,
+    )
+    with capture_notifications():
+        submission.save()
+    return submission
+
+
+@pytest.fixture()
+def cedar_template():
+    return make_cedar_template()
+
+
+@pytest.fixture()
+def provider_with_template(cedar_template):
+    provider = CollectionProviderFactory()
+    provider.required_metadata_template = cedar_template
+    provider.save()
+    return provider
+
+
+@pytest.fixture()
+def provider_without_template():
+    return CollectionProviderFactory()
+
+
+@pytest.mark.django_db
+class TestCopyCollectionSubmissionMetadataToCedar:
+
+    def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, collected_type='software', status='active')
+
+        copy_collection_submission_metadata_to_cedar()
+
+        assert CedarMetadataRecord.objects.filter(
+            guid=submission.guid,
+            template=cedar_template,
+        ).exists()
+
+    def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, collected_type='dataset', status='', volume='')
+
+        copy_collection_submission_metadata_to_cedar()
+
+        record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
+        assert record.metadata == {'collected_type': 'dataset'}
+
+    def test_record_is_published(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, status='active')
+
+        copy_collection_submission_metadata_to_cedar()
+
+        record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
+        assert record.is_published is True
+
+    def test_updates_existing_record(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, status='new')
+        CedarMetadataRecord.objects.create(
+            guid=submission.guid,
+            template=cedar_template,
+            metadata={'status': 'old'},
+            is_published=False,
+        )
+
+        copy_collection_submission_metadata_to_cedar()
+
+        record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
+        assert record.metadata == {'status': 'new'}
+        assert record.is_published is True
+
+    def test_skips_submissions_without_required_template(self, provider_without_template):
+        collection = make_collection(provider_without_template)
+        submission = make_submission(collection, collected_type='software')
+
+        copy_collection_submission_metadata_to_cedar()
+
+        assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists()
+
+    def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, collected_type='software')
+
+        copy_collection_submission_metadata_to_cedar(dry_run=True)
+
+        assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists()
+
+    def test_provider_filter_processes_only_matching_provider(self, cedar_template):
+        provider_a = CollectionProviderFactory()
+        provider_a.required_metadata_template = cedar_template
+        provider_a.save()
+
+        provider_b = CollectionProviderFactory()
+        provider_b.required_metadata_template = make_cedar_template()
+        provider_b.save()
+
+        sub_a = make_submission(make_collection(provider_a), collected_type='software')
+        sub_b = make_submission(make_collection(provider_b), collected_type='dataset')
+
+        copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id)
+
+        assert CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template).exists()
+        assert not CedarMetadataRecord.objects.filter(guid=sub_b.guid).exists()
+
+    def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        make_submission(collection, collected_type='software')
+        make_submission(collection, collected_type='dataset')
+
+        with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata') as mock_sync:
+            mock_sync.side_effect = [Exception('simulated error'), None]
+            copy_collection_submission_metadata_to_cedar()
+
+        assert mock_sync.call_count == 2
+
+    def test_call_command_interface(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, collected_type='software')
+
+        call_command('copy_collection_submission_metadata_to_cedar')
+
+        assert CedarMetadataRecord.objects.filter(
+            guid=submission.guid,
+            template=cedar_template,
+        ).exists()
+
+    def test_call_command_dry_run(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(collection, collected_type='software')
+
+        call_command('copy_collection_submission_metadata_to_cedar', dry_run=True)
+
+        assert not CedarMetadataRecord.objects.filter(guid=submission.guid).exists()
+
+    def test_all_cedar_fields_copied(self, provider_with_template, cedar_template):
+        collection = make_collection(provider_with_template)
+        submission = make_submission(
+            collection,
+            collected_type='software',
+            status='active',
+            volume='1',
+            issue='2',
+            program_area='health',
+            school_type='university',
+            study_design='rct',
+            data_type='quantitative',
+            disease='cancer',
+            grade_levels='K-12',
+        )
+
+        copy_collection_submission_metadata_to_cedar()
+
+        record = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
+        assert record.metadata == {
+            'collected_type': 'software',
+            'status': 'active',
+            'volume': '1',
+            'issue': '2',
+            'program_area': 'health',
+            'school_type': 'university',
+            'study_design': 'rct',
+            'data_type': 'quantitative',
+            'disease': 'cancer',
+            'grade_levels': 'K-12',
+        }
diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py
index d3b38b540df..dc5a847c2e1 100644
--- a/osf_tests/metadata/test_serialized_metadata.py
+++ b/osf_tests/metadata/test_serialized_metadata.py
@@ -11,6 +11,7 @@
 from osf.metrics.reports import PublicItemUsageReport
 from osf.metrics.utils import YearMonth
 from osf.models.licenses import NodeLicense
+from osf.models.nodelog import NodeLog
 from api_tests.utils import create_test_file
 from osf_tests import factories
 from osf_tests.metadata._utils import assert_equivalent_turtle
@@ -204,12 +205,14 @@ def setUp(self):
         super().setUp()
         # patch auto-generated fields into predictable values
         osfguid_sequence = OsfguidSequence('wibble')
+        _nodelog_date_field = NodeLog._meta.get_field('date')
         for patcher in (
             mock.patch('osf.models.base.generate_guid', new=osfguid_sequence),
             mock.patch('osf.models.base.Guid.objects.get_or_create', new=osfguid_sequence.get_or_create),
             mock.patch('django.utils.timezone.now', new=forever_now),
             mock.patch('osf.models.mixins.timezone.now', new=forever_now),
             mock.patch('osf.models.nodelog.timezone.now', new=forever_now),
+            mock.patch.dict(_nodelog_date_field.__dict__, {'_get_default': forever_now}),
             mock.patch('osf.models.metaschema.RegistrationSchema.absolute_api_v2_url', new='http://fake.example/schema/for/test'),
             mock.patch('osf.models.node.Node.get_verified_links', return_value=[
                 {'target_url': 'https://foo.bar', 'resource_type': 'Other'}
diff --git a/osf_tests/test_cedar_metadata_record.py b/osf_tests/test_cedar_metadata_record.py
new file mode 100644
index 00000000000..739c2afb762
--- /dev/null
+++ b/osf_tests/test_cedar_metadata_record.py
@@ -0,0 +1,93 @@
+import pytest
+from faker import Faker
+from django.core.exceptions import ValidationError
+
+from osf.models import CedarMetadataRecord, CedarMetadataTemplate
+from osf_tests.factories import AuthUserFactory, ProjectFactory
+
+fake = Faker()
+
+CEDAR_SCHEMA = {
+    '$schema': 'http://json-schema.org/draft-07/schema#',
+    'type': 'object',
+    'properties': {
+        '@context': {'type': 'object'},
+        'School Type': {
+            'type': 'object',
+            'properties': {'@value': {'type': 'string'}},
+        },
+    },
+    'required': ['@id', 'pav:createdOn', 'pav:createdBy', 'pav:lastUpdatedOn', 'oslc:modifiedBy', 'School Type'],
+    'additionalProperties': True,
+}
+
+
+@pytest.fixture()
+def user():
+    return AuthUserFactory()
+
+
+@pytest.fixture()
+def node(user):
+    return ProjectFactory(creator=user)
+
+
+@pytest.fixture()
+def cedar_template():
+    return CedarMetadataTemplate.objects.create(
+        schema_name=fake.bs(),
+        cedar_id=fake.md5(),
+        template_version=1,
+        template=CEDAR_SCHEMA,
+        active=True,
+    )
+
+
+@pytest.mark.django_db
+class TestCedarMetadataRecordClean:
+
+    def _make_record(self, node, cedar_template, metadata, is_published=True):
+        record = CedarMetadataRecord(
+            guid=node.guids.first(),
+            template=cedar_template,
+            metadata=metadata,
+            is_published=is_published,
+        )
+        return record
+
+    def test_provenance_fields_stripped_from_required(self, node, cedar_template):
+        record = self._make_record(node, cedar_template, {
+            'School Type': {'@value': 'High School'},
+        })
+        record.clean()
+
+    def test_empty_dict_properties_stripped_from_metadata(self, node, cedar_template):
+        record = self._make_record(node, cedar_template, {
+            'School Type': {'@value': 'High School'},
+            '3de6ff2c-555b-44d4-84b6-3862188d29c9': {},
+        })
+        record.clean()
+
+    def test_invalid_metadata_raises_validation_error(self, node, cedar_template):
+        record = self._make_record(node, cedar_template, {
+            'School Type': 'not-an-object',
+        })
+        with pytest.raises(ValidationError, match='does not validate against template'):
+            record.clean()
+
+    def test_missing_non_provenance_required_field_raises(self, node, cedar_template):
+        record = self._make_record(node, cedar_template, {})
+        with pytest.raises(ValidationError, match='does not validate against template'):
+            record.clean()
+
+    def test_draft_record_skips_validation(self, node, cedar_template):
+        record = self._make_record(node, cedar_template, {}, is_published=False)
+        record.clean()
+
+    def test_template_required_list_not_mutated(self, node, cedar_template):
+        original_required = list(cedar_template.template['required'])
+        record = self._make_record(node, cedar_template, {
+            'School Type': {'@value': 'High School'},
+        })
+        record.clean()
+        assert cedar_template.template['required'] == original_required