From 2cc6b5b99e01b8c260597e9976a17a3c5befd963 Mon Sep 17 00:00:00 2001
From: Kaushik
Date: Fri, 5 Jun 2026 14:31:56 +0000
Subject: [PATCH 1/3] Add script to annotate composite license rules with required phrases

Uses scancode license index to find license names in AND/OR
rule text and inject {{ }} markers via add_required_phrase_to_rule().

Rule files that cannot be loaded and license expressions that cannot be
parsed are reported on stderr and skipped instead of being dropped
silently. --limit 0 now processes no rule instead of all of them.

Signed-off-by: Kaushik
---
Reviewer notes (text between '---' and the diff is not part of the commit):
- hunk size and diffstat were updated by hand for the changes in main();
  the blob id on the 'index' line was not regenerated.
- only lines that are neither context nor removed lines of PATCH 2/3 were
  touched, so that patch still applies on top of this one.

 .../dataset_pipeline/annotate_composites.py   | 168 ++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 etc/scripts/dataset_pipeline/annotate_composites.py

diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
new file mode 100644
index 0000000000..4f4de5df00
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -0,0 +1,168 @@
+# annotates composite (AND/OR) license rules with {{ }} required phrase markers
+# uses scancode's license index to find license names in rule text
+import re
+from pathlib import Path
+import click
+from license_expression import Licensing
+from licensedcode.models import Rule, load_licenses
+from licensedcode.models import rules_data_dir as default_rules_data_dir
+from licensedcode.required_phrases import add_required_phrase_to_rule
+
+MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
+VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
+
+# extra short forms used in rule text that the license index doesnt have
+EXTRA_NAMES = {
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
+    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
+    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
+    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
+    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
+    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
+    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
+    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
+    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
+    'mit': ['MIT License', 'MIT license', 'MIT'],
+    'isc': ['ISC License', 'ISC license', 'ISC'],
+    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
+    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
+    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
+    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
+    'unlicense': ['Unlicense'],
+}
+
+
+def strip_version_suffix(name):
+    """removing trailing version from a license name"""
+    result = VERSION_SUFFIX_RE.sub('', name).strip()
+    if len(result) < 10 or result == name:
+        return None
+    return result
+
+
+def get_candidate_names(lic):
+    """collect names to search for.longest first"""
+    names = []
+    if lic.name:
+        names.append(lic.name)
+        base = strip_version_suffix(lic.name)
+        if base:
+            names.append(base)
+    if lic.short_name and lic.short_name not in names:
+        names.append(lic.short_name)
+    if lic.spdx_license_key and lic.spdx_license_key not in names:
+        names.append(lic.spdx_license_key)
+    if lic.key not in names:
+        names.append(lic.key)
+    for e in EXTRA_NAMES.get(lic.key, []):
+        if e not in names:
+            names.append(e)
+    names.sort(key=len, reverse=True)
+    return names
+
+
+def find_in_text(text, candidates):
+    """case insensitive search.returns matched span having original case"""
+    text_lower = text.lower()
+    for name in candidates:
+        if not name or len(name) < 3:
+            continue
+        pos = text_lower.find(name.lower())
+        if pos != -1:
+            return text[pos:pos + len(name)]
+    return None
+
+
+@click.command()
+@click.option('--rules-dir', type=click.Path(exists=True), default=None)
+@click.option('--expression-filter', default=None,
+              help='only process rules containing this in their expression')
+@click.option('--limit', type=int, default=None)
+@click.option('--dry-run', is_flag=True)
+def main(rules_dir, expression_filter, limit, dry_run):
+    """annotate rules with required phrase markers"""
+    if not rules_dir:
+        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
+        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
+
+    rules_path = Path(rules_dir)
+    licenses_db = load_licenses()
+    licensing = Licensing(list(licenses_db.values()))
+
+    processed = 0
+    annotated = 0
+
+    for rf in sorted(rules_path.glob('*.RULE')):
+        if limit is not None and processed >= limit:
+            break
+        stem = rf.stem
+        if '_or_' not in stem and '_and_' not in stem:
+            continue
+        try:
+            rule = Rule.from_file(rule_file=str(rf))
+        except Exception as e:
+            # best effort: keep going, but say which rule file was skipped
+            click.echo(f'skipping {rf.name}: {e}', err=True)
+            continue
+        if not rule.license_expression:
+            continue
+        if getattr(rule, 'is_required_phrase', False):
+            continue
+        text = rule.text or ''
+        if MARKER_RE.search(text):
+            continue
+        if expression_filter and expression_filter not in rule.license_expression:
+            continue
+
+        processed += 1
+        try:
+            keys = licensing.license_keys(rule.license_expression, unique=True)
+        except Exception as e:
+            # unparsable expression: report it instead of dropping it silently
+            click.echo(f'skipping {rule.identifier}: {e}', err=True)
+            continue
+        if not keys:
+            continue
+
+        # find each license name in the text
+        phrases = []
+        found_all = True
+        for key in keys:
+            lic = licenses_db.get(key)
+            if not lic:
+                found_all = False
+                break
+            match = find_in_text(text, get_candidate_names(lic))
+            if not match:
+                found_all = False
+                break
+            phrases.append(match)
+
+        if not found_all:
+            continue
+
+        added = False
+        for phrase in phrases:
+            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
+                                           source='composite_annotation', dry_run=dry_run):
+                added = True
+        if added:
+            annotated += 1
+            if dry_run:
+                click.echo(f' {rule.identifier}: {phrases}')
+
+    click.echo(f'\ndone - {annotated}/{processed} annotated')
+
+
+if __name__ == '__main__':
+    main()
+
+# commands:
+# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
+# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
+# python etc/scripts/dataset_pipeline/annotate_composites.py
From c1c885801be65a0e640ff661a36d98469d50a3a2 Mon Sep 17 00:00:00 2001
From: Kaushik
Date: Sun, 14 Jun 2026 20:17:07 +0000
Subject: [PATCH 2/3] Add safety filters and more name variants

Skip rules flagged skip_for_required_phrase_generation and rules that
are not approx matchable.

Match license names only as whole words in find_in_text(): a plain
substring search finds short names such as 'MIT' or 'ISC' inside
unrelated words ('permitted', 'disclaimer') and 'GPL-2.1' inside
'LGPL-2.1', which made main() believe every license of the expression
was named in the rule text.
---
Reviewer notes (text between '---' and the diff is not part of the commit):
- the find_in_text hunk was widened by hand; hunk ranges and the diffstat
  were updated accordingly, the blob ids on the 'index' line were not
  regenerated.
- NOTE(review): PATCH 1/3 carries a Signed-off-by trailer, 2/3 and 3/3 do
  not - confirm whether the project requires one on every commit.

 .../dataset_pipeline/annotate_composites.py   | 44 +++++++++++--------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
index 4f4de5df00..b3d449ff52 100644
--- a/etc/scripts/dataset_pipeline/annotate_composites.py
+++ b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -11,34 +11,36 @@
 MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
 VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
 
-# extra short forms used in rule text that the license index doesnt have
+# short forms that appear in rule text but aren't in the license index
 EXTRA_NAMES = {
-    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
     'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
     'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
     'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
     'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
-    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
     'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
     'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
     'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
     'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
     'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
     'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
-    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
-    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
     'mit': ['MIT License', 'MIT license', 'MIT'],
     'isc': ['ISC License', 'ISC license', 'ISC'],
     'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
     'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
     'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
     'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
-    'unlicense': ['Unlicense'],
+    'unlicense': ['Unlicense', 'UNLICENSE'],
 }
 
 
 def strip_version_suffix(name):
-    """removing trailing version from a license name"""
+    """remove trailing version number from a license name"""
     result = VERSION_SUFFIX_RE.sub('', name).strip()
     if len(result) < 10 or result == name:
         return None
@@ -46,7 +48,7 @@ def strip_version_suffix(name):
 
 
 def get_candidate_names(lic):
-    """collect names to search for.longest first"""
+    """collect names to search for, longest first"""
     names = []
     if lic.name:
         names.append(lic.name)
@@ -67,15 +69,20 @@ def get_candidate_names(lic):
 
 
 def find_in_text(text, candidates):
-    """case insensitive search.returns matched span having original case"""
-    text_lower = text.lower()
+    """
+    case insensitive whole-word search, returns the first candidate found in
+    text (in the original case of text) or None. candidates under 3 chars are skipped
+    """
     for name in candidates:
         if not name or len(name) < 3:
             continue
-        pos = text_lower.find(name.lower())
-        if pos != -1:
-            return text[pos:pos + len(name)]
+        # only accept a name that is not glued to other word characters, so
+        # that 'MIT' is not found in 'permitted' nor 'ISC' in 'disclaimer'
+        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
+        found = re.search(pattern, text, re.IGNORECASE)
+        if found:
+            return found.group(0)
     return None
 
 
 @click.command()
@@ -85,7 +92,7 @@ def find_in_text(text, candidates):
 @click.option('--limit', type=int, default=None)
 @click.option('--dry-run', is_flag=True)
 def main(rules_dir, expression_filter, limit, dry_run):
-    """annotate rules with required phrase markers"""
+    """Annotate composite license rules with required phrase markers"""
     if not rules_dir:
         repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
         rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
@@ -111,6 +118,10 @@ def main(rules_dir, expression_filter, limit, dry_run):
             continue
         if getattr(rule, 'is_required_phrase', False):
             continue
+        if getattr(rule, 'skip_for_required_phrase_generation', False):
+            continue
+        if not getattr(rule, 'is_approx_matchable', True):
+            continue
         text = rule.text or ''
         if MARKER_RE.search(text):
             continue
@@ -157,8 +168,3 @@ def main(rules_dir, expression_filter, limit, dry_run):
 
 if __name__ == '__main__':
     main()
-
-# commands:
-# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
-# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
-# python etc/scripts/dataset_pipeline/annotate_composites.py

From baa56a6d08b2d3f4f9f9b9d1ba99e20c75c85da8 Mon Sep 17 00:00:00 2001
From: Kaushik
Date: Sun, 14 Jun 2026 21:01:21 +0000
Subject: [PATCH 3/3] Add tests for composite rule annotation script

Cover strip_version_suffix(), get_candidate_names() and find_in_text(),
including the whole-word matching of license names.
---
 .../test_annotate_composites.py               | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 etc/scripts/dataset_pipeline/test_annotate_composites.py

diff --git a/etc/scripts/dataset_pipeline/test_annotate_composites.py b/etc/scripts/dataset_pipeline/test_annotate_composites.py
new file mode 100644
index 0000000000..053bc9a75a
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/test_annotate_composites.py
@@ -0,0 +1,106 @@
+# tests for annotate_composites.py
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from annotate_composites import strip_version_suffix, get_candidate_names, find_in_text
+
+
+class TestStripVersionSuffix:
+
+    def test_strips_trailing_version(self):
+        assert strip_version_suffix('GNU General Public License 3.0') == 'GNU General Public License'
+
+    def test_strips_version_with_v_prefix(self):
+        assert strip_version_suffix('Mozilla Public License v2.0') == 'Mozilla Public License'
+
+    def test_strips_or_later(self):
+        result = strip_version_suffix('GNU Lesser General Public License 2.1 or later')
+        assert result == 'GNU Lesser General Public License'
+
+    def test_returns_none_when_result_too_short(self):
+        assert strip_version_suffix('EPL 1.0') is None
+
+    def test_returns_none_when_no_version(self):
+        assert strip_version_suffix('Boost Software License') is None
+
+
+class FakeLicense:
+    """minimal mock for License objects"""
+    def __init__(self, key, name=None, short_name=None, spdx_license_key=None):
+        self.key = key
+        self.name = name
+        self.short_name = short_name
+        self.spdx_license_key = spdx_license_key
+
+
+class TestGetCandidateNames:
+
+    def test_collects_all_fields(self):
+        lic = FakeLicense(
+            key='apache-2.0',
+            name='Apache License 2.0',
+            short_name='Apache 2.0',
+            spdx_license_key='Apache-2.0',
+        )
+        names = get_candidate_names(lic)
+        assert 'Apache License 2.0' in names
+        assert 'Apache 2.0' in names
+        assert 'Apache-2.0' in names
+        assert 'apache-2.0' in names
+
+    def test_includes_extra_names_for_known_keys(self):
+        lic = FakeLicense(key='gpl-2.0', name='GNU General Public License 2.0')
+        names = get_candidate_names(lic)
+        assert 'GPLv2' in names
+        assert 'GPL 2.0' in names
+
+    def test_includes_version_stripped_base(self):
+        lic = FakeLicense(key='agpl-3.0', name='GNU Affero General Public License 3.0')
+        names = get_candidate_names(lic)
+        assert 'GNU Affero General Public License' in names
+
+    def test_sorted_longest_first(self):
+        lic = FakeLicense(key='mit', name='MIT License', short_name='MIT')
+        names = get_candidate_names(lic)
+        for i in range(len(names) - 1):
+            assert len(names[i]) >= len(names[i + 1])
+
+    def test_no_duplicates(self):
+        lic = FakeLicense(key='isc', name='ISC License', short_name='ISC License',
+                          spdx_license_key='ISC')
+        names = get_candidate_names(lic)
+        assert len(names) == len(set(names))
+
+
+class TestFindInText:
+
+    def test_basic_match(self):
+        text = 'Licensed under the Apache License, Version 2.0'
+        assert find_in_text(text, ['Apache License']) == 'Apache License'
+
+    def test_case_insensitive(self):
+        text = 'distributed under the gnu general public license'
+        result = find_in_text(text, ['GNU General Public License'])
+        assert result == 'gnu general public license'
+
+    def test_not_found(self):
+        text = 'this software uses the BSD license'
+        assert find_in_text(text, ['Apache License', 'MIT License']) is None
+
+    def test_skips_very_short_candidates(self):
+        assert find_in_text('released under XY terms', ['XY']) is None
+
+    def test_longer_candidate_matched_first(self):
+        text = 'Released under the MIT License'
+        assert find_in_text(text, ['MIT License', 'MIT']) == 'MIT License'
+
+    def test_does_not_match_inside_longer_word(self):
+        text = 'Redistribution is permitted, see the disclaimer'
+        assert find_in_text(text, ['MIT', 'ISC']) is None
+
+    def test_does_not_match_part_of_another_name(self):
+        assert find_in_text('licensed under LGPL-2.1', ['GPL-2.1']) is None
+
+    def test_matches_name_next_to_punctuation(self):
+        assert find_in_text('Dual licensed (MIT/GPLv2+).', ['GPLv2+']) == 'GPLv2+'