diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
new file mode 100644
index 0000000000..b3d449ff52
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -0,0 +1,213 @@
+# Annotate composite (AND/OR) license rules with {{ }} required phrase markers.
+# Uses scancode's license index to find license names in the rule text.
+import re
+from functools import lru_cache
+from pathlib import Path
+
+import click
+from license_expression import Licensing
+
+from licensedcode.models import Rule, load_licenses
+from licensedcode.models import rules_data_dir as default_rules_data_dir
+from licensedcode.required_phrases import add_required_phrase_to_rule
+
+# An existing {{ ... }} marker: rules that already carry one are skipped.
+MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
+# A trailing version such as " 2.0", " v3", " 2.1 or later" or " 3.0+".
+VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
+
+# Candidate names shorter than this are too ambiguous to search for.
+MIN_CANDIDATE_LENGTH = 3
+# A version-stripped base name shorter than this is too generic to keep.
+MIN_BASE_NAME_LENGTH = 10
+
+# Short forms that appear in rule text but aren't in the license index.
+EXTRA_NAMES = {
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
+    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
+    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
+    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
+    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
+    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
+    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
+    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
+    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
+    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
+    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
+    'mit': ['MIT License', 'MIT license', 'MIT'],
+    'isc': ['ISC License', 'ISC license', 'ISC'],
+    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
+    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
+    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
+    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
+    'unlicense': ['Unlicense', 'UNLICENSE'],
+}
+
+
+def strip_version_suffix(name):
+    """
+    Return ``name`` without its trailing version number, or None.
+
+    None is returned when ``name`` has no version suffix, or when the base
+    name left after stripping is shorter than MIN_BASE_NAME_LENGTH.
+    """
+    result = VERSION_SUFFIX_RE.sub('', name).strip()
+    if len(result) < MIN_BASE_NAME_LENGTH or result == name:
+        return None
+    return result
+
+
+def get_candidate_names(lic):
+    """
+    Return the names to search for in a rule text for ``lic``, longest first.
+
+    The candidates are the license name, its version-stripped base name, its
+    short name, SPDX key and key, and any EXTRA_NAMES aliases. Empty values
+    and duplicates are dropped.
+    """
+    candidates = []
+    if lic.name:
+        candidates.append(lic.name)
+        candidates.append(strip_version_suffix(lic.name))
+    candidates.extend([lic.short_name, lic.spdx_license_key, lic.key])
+    candidates.extend(EXTRA_NAMES.get(lic.key, []))
+    # dict.fromkeys() removes duplicates and keeps first-seen order, and the
+    # stable sort keeps that order among names of equal length.
+    names = list(dict.fromkeys(c for c in candidates if c))
+    names.sort(key=len, reverse=True)
+    return names
+
+
+@lru_cache(maxsize=None)
+def _candidate_pattern(name):
+    """
+    Return a compiled, case-insensitive pattern matching ``name`` as a whole token.
+
+    The lookarounds reject a match glued to a word character on either side
+    ("MIT" in "limited", "GPL-2.0" in "LGPL-2.0") or followed by more version
+    digits ("GPL version 2" in "GPL version 2.1").
+    """
+    return re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w|\.\d)', re.IGNORECASE)
+
+
+def find_in_text(text, candidates):
+    """
+    Return the first of ``candidates`` found in ``text``, in the text's own case.
+
+    Candidates are tried in order and matched case-insensitively as whole
+    tokens. Return None when nothing matches. Matching runs on the original
+    text rather than on a lowercased copy, because lowercasing can change the
+    length of a string and shift the match offsets.
+    """
+    for name in candidates:
+        if not name or len(name) < MIN_CANDIDATE_LENGTH:
+            continue
+        found = _candidate_pattern(name).search(text)
+        if found:
+            return found.group(0)
+    return None
+
+
+def _is_eligible(rule, expression_filter):
+    """Return True if ``rule`` is a candidate for required phrase annotation."""
+    if not rule.license_expression:
+        return False
+    if getattr(rule, 'is_required_phrase', False):
+        return False
+    if getattr(rule, 'skip_for_required_phrase_generation', False):
+        return False
+    if not getattr(rule, 'is_approx_matchable', True):
+        return False
+    # Never touch a rule that already has required phrase markers.
+    if MARKER_RE.search(rule.text or ''):
+        return False
+    if expression_filter and expression_filter not in rule.license_expression:
+        return False
+    return True
+
+
+def _collect_phrases(text, keys, licenses_db):
+    """
+    Return the text matched for each of the license ``keys``, or None.
+
+    None is returned as soon as one key is unknown or has no name in ``text``:
+    a composite rule is only annotated when every license is found.
+    """
+    phrases = []
+    for key in keys:
+        lic = licenses_db.get(key)
+        match = find_in_text(text, get_candidate_names(lic)) if lic else None
+        if not match:
+            return None
+        phrases.append(match)
+    # Two licenses can match the same span (e.g. a shared base name): add it once.
+    return list(dict.fromkeys(phrases))
+
+
+@click.command()
+@click.option('--rules-dir', type=click.Path(exists=True), default=None)
+@click.option('--expression-filter', default=None,
+              help='only process rules containing this in their expression')
+@click.option('--limit', type=int, default=None)
+@click.option('--dry-run', is_flag=True)
+def main(rules_dir, expression_filter, limit, dry_run):
+    """Annotate composite license rules with required phrase markers"""
+    if not rules_dir:
+        # Prefer the rules directory found relative to this script, if it exists.
+        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
+        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
+
+    licenses_db = load_licenses()
+    licensing = Licensing(list(licenses_db.values()))
+
+    processed = 0
+    annotated = 0
+
+    for rf in sorted(Path(rules_dir).glob('*.RULE')):
+        if limit and processed >= limit:
+            break
+        # Composite rules are recognized by their file name.
+        if '_or_' not in rf.stem and '_and_' not in rf.stem:
+            continue
+        try:
+            rule = Rule.from_file(rule_file=str(rf))
+        except Exception as e:
+            # Best effort: a broken rule must not stop the run, but report it.
+            click.echo(f'skipping {rf.name}: cannot load rule: {e!r}', err=True)
+            continue
+        if not _is_eligible(rule, expression_filter):
+            continue
+
+        processed += 1
+        try:
+            keys = licensing.license_keys(rule.license_expression, unique=True)
+        except Exception as e:
+            click.echo(f'skipping {rf.name}: cannot parse expression: {e!r}', err=True)
+            continue
+        if not keys:
+            continue
+
+        phrases = _collect_phrases(rule.text or '', keys, licenses_db)
+        if not phrases:
+            continue
+
+        added = False
+        for phrase in phrases:
+            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
+                                           source='composite_annotation', dry_run=dry_run):
+                added = True
+        if added:
+            annotated += 1
+            if dry_run:
+                click.echo(f' {rule.identifier}: {phrases}')
+
+    click.echo(f'\ndone - {annotated}/{processed} annotated')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/etc/scripts/dataset_pipeline/test_annotate_composites.py b/etc/scripts/dataset_pipeline/test_annotate_composites.py
new file mode 100644
index 0000000000..053bc9a75a
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/test_annotate_composites.py
@@ -0,0 +1,112 @@
+# tests for annotate_composites.py
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from annotate_composites import strip_version_suffix, get_candidate_names, find_in_text
+
+
+class TestStripVersionSuffix:
+
+    def test_strips_trailing_version(self):
+        assert strip_version_suffix('GNU General Public License 3.0') == 'GNU General Public License'
+
+    def test_strips_version_with_v_prefix(self):
+        assert strip_version_suffix('Mozilla Public License v2.0') == 'Mozilla Public License'
+
+    def test_strips_or_later(self):
+        result = strip_version_suffix('GNU Lesser General Public License 2.1 or later')
+        assert result == 'GNU Lesser General Public License'
+
+    def test_returns_none_when_result_too_short(self):
+        assert strip_version_suffix('EPL 1.0') is None
+
+    def test_returns_none_when_no_version(self):
+        assert strip_version_suffix('Boost Software License') is None
+
+
+class FakeLicense:
+    """minimal mock for License objects"""
+    def __init__(self, key, name=None, short_name=None, spdx_license_key=None):
+        self.key = key
+        self.name = name
+        self.short_name = short_name
+        self.spdx_license_key = spdx_license_key
+
+
+class TestGetCandidateNames:
+
+    def test_collects_all_fields(self):
+        lic = FakeLicense(
+            key='apache-2.0',
+            name='Apache License 2.0',
+            short_name='Apache 2.0',
+            spdx_license_key='Apache-2.0',
+        )
+        names = get_candidate_names(lic)
+        assert 'Apache License 2.0' in names
+        assert 'Apache 2.0' in names
+        assert 'Apache-2.0' in names
+        assert 'apache-2.0' in names
+
+    def test_includes_extra_names_for_known_keys(self):
+        lic = FakeLicense(key='gpl-2.0', name='GNU General Public License 2.0')
+        names = get_candidate_names(lic)
+        assert 'GPLv2' in names
+        assert 'GPL 2.0' in names
+
+    def test_includes_version_stripped_base(self):
+        lic = FakeLicense(key='agpl-3.0', name='GNU Affero General Public License 3.0')
+        names = get_candidate_names(lic)
+        assert 'GNU Affero General Public License' in names
+
+    def test_sorted_longest_first(self):
+        lic = FakeLicense(key='mit', name='MIT License', short_name='MIT')
+        names = get_candidate_names(lic)
+        assert names == sorted(names, key=len, reverse=True)
+
+    def test_no_duplicates(self):
+        lic = FakeLicense(key='isc', name='ISC License', short_name='ISC License',
+                          spdx_license_key='ISC')
+        names = get_candidate_names(lic)
+        assert len(names) == len(set(names))
+
+    def test_skips_missing_fields(self):
+        names = get_candidate_names(FakeLicense(key='mit'))
+        assert None not in names
+
+
+class TestFindInText:
+
+    def test_basic_match(self):
+        text = 'Licensed under the Apache License, Version 2.0'
+        assert find_in_text(text, ['Apache License']) == 'Apache License'
+
+    def test_case_insensitive(self):
+        text = 'distributed under the gnu general public license'
+        result = find_in_text(text, ['GNU General Public License'])
+        assert result == 'gnu general public license'
+
+    def test_not_found(self):
+        text = 'this software uses the BSD license'
+        assert find_in_text(text, ['Apache License', 'MIT License']) is None
+
+    def test_skips_very_short_candidates(self):
+        assert find_in_text('released under XY terms', ['XY']) is None
+
+    def test_longer_candidate_matched_first(self):
+        text = 'Released under the MIT License'
+        assert find_in_text(text, ['MIT License', 'MIT']) == 'MIT License'
+
+    def test_does_not_match_inside_a_word(self):
+        assert find_in_text('all warranties are limited', ['MIT']) is None
+
+    def test_does_not_match_inside_another_license_name(self):
+        assert find_in_text('licensed under LGPL-2.0 terms', ['GPL-2.0']) is None
+
+    def test_does_not_match_a_longer_version(self):
+        assert find_in_text('under GPL version 2.1 terms', ['GPL version 2']) is None
+
+    def test_match_is_not_shifted_by_case_folding(self):
+        text = '\u0130stanbul: MIT License'
+        assert find_in_text(text, ['MIT License']) == 'MIT License'