From 2cc6b5b99e01b8c260597e9976a17a3c5befd963 Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Fri, 5 Jun 2026 14:31:56 +0000
Subject: [PATCH 1/3] Add script to annotate composite license rules with
 required phrases

Use the scancode license index to find license names in the text of
AND/OR rules and inject {{ }} markers around them via
add_required_phrase_to_rule().

Signed-off-by: Kaushik <kaushikrjpm10@gmail.com>
---
 .../dataset_pipeline/annotate_composites.py   | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 etc/scripts/dataset_pipeline/annotate_composites.py

diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
new file mode 100644
index 0000000000..4f4de5df00
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -0,0 +1,164 @@
+# annotates composite (AND/OR) license rules with {{ }} required phrase markers
+# uses scancode's license index to find license names in rule text
+import re
+from pathlib import Path
+import click
+from license_expression import Licensing
+from licensedcode.models import Rule, load_licenses
+from licensedcode.models import rules_data_dir as default_rules_data_dir
+from licensedcode.required_phrases import add_required_phrase_to_rule
+
+MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
+VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
+
+# extra short forms used in rule text that the license index doesnt have
+EXTRA_NAMES = {
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
+    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
+    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
+    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
+    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
+    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
+    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
+    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
+    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
+    'mit': ['MIT License', 'MIT license', 'MIT'],
+    'isc': ['ISC License', 'ISC license', 'ISC'],
+    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
+    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
+    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
+    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
+    'unlicense': ['Unlicense'],
+}
+
+
+def strip_version_suffix(name):
+    """removing trailing version from a license name"""
+    result = VERSION_SUFFIX_RE.sub('', name).strip()
+    if len(result) < 10 or result == name:
+        return None
+    return result  # reached only when result differs from name and is at least 10 chars long
+
+
+def get_candidate_names(lic):
+    """collect names to search for.longest first"""
+    names = []
+    if lic.name:
+        names.append(lic.name)
+        stripped = strip_version_suffix(lic.name)
+        if stripped:
+            names.append(stripped)
+    # remaining identifiers keep their priority order; repeats are skipped
+    for alias in (lic.short_name, lic.spdx_license_key):
+        if alias and alias not in names:
+            names.append(alias)
+    if lic.key not in names:
+        names.append(lic.key)
+    for extra in EXTRA_NAMES.get(lic.key, []):
+        if extra not in names:
+            names.append(extra)
+    # stable sort: names of equal length stay in collection order
+    return sorted(names, key=len, reverse=True)
+
+
+def find_in_text(text, candidates):
+    """case insensitive search.returns matched span having original case"""
+    text_lower = text.lower()
+    for name in candidates:
+        if not name or len(name) < 3:
+            continue  # empty candidates and those under 3 characters are never searched
+        hit = re.search(r'(?<!\w)' + re.escape(name.lower()) + r'(?!\w)', text_lower)
+        if hit:  # whole-word hits only: 'MIT' must not match inside 'permitted'
+            return text[hit.start():hit.end()]
+    return None
+
+
+@click.command()
+@click.option('--rules-dir', type=click.Path(exists=True), default=None)
+@click.option('--expression-filter', default=None,
+              help='only process rules containing this in their expression')
+@click.option('--limit', type=int, default=None)
+@click.option('--dry-run', is_flag=True)
+def main(rules_dir, expression_filter, limit, dry_run):
+    """annotate rules with required phrase markers"""
+    if not rules_dir:
+        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
+        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
+
+    rules_path = Path(rules_dir)
+    licenses_db = load_licenses()  # mapping looked up by license key further down
+    licensing = Licensing(list(licenses_db.values()))
+
+    processed = 0
+    annotated = 0
+
+    for rf in sorted(rules_path.glob('*.RULE')):  # sorted for a stable visiting order
+        if limit and processed >= limit:  # a limit of 0 or None means no limit
+            break
+        stem = rf.stem
+        if '_or_' not in stem and '_and_' not in stem:  # composites are recognised by file name only
+            continue
+        try:
+            rule = Rule.from_file(rule_file=str(rf))
+        except Exception:  # NOTE(review): unloadable rule files are skipped silently
+            continue
+        if not rule.license_expression:
+            continue
+        if getattr(rule, 'is_required_phrase', False):
+            continue
+        text = rule.text or ''
+        if MARKER_RE.search(text):
+            continue
+        if expression_filter and expression_filter not in rule.license_expression:  # plain substring test
+            continue
+
+        processed += 1  # counts every rule that passed the filters above, annotated or not
+        try:
+            keys = licensing.license_keys(rule.license_expression, unique=True)
+        except Exception:  # NOTE(review): unparsable expressions are skipped silently
+            continue
+        if not keys:
+            continue
+
+        # find each license name in the text; give up on the rule at the first miss
+        phrases = []
+        found_all = True
+        for key in keys:
+            lic = licenses_db.get(key)
+            if not lic:
+                found_all = False
+                break
+            match = find_in_text(text, get_candidate_names(lic))
+            if not match:
+                found_all = False
+                break
+            phrases.append(match)
+
+        if not found_all:
+            continue
+
+        added = False  # set when any add_required_phrase_to_rule() call returns a truthy value
+        for phrase in phrases:
+            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
+                                          source='composite_annotation', dry_run=dry_run):
+                added = True
+        if added:
+            annotated += 1
+            if dry_run:  # per-rule output is printed only in dry-run mode
+                click.echo(f'  {rule.identifier}: {phrases}')
+
+    click.echo(f'\ndone - {annotated}/{processed} annotated')
+
+
+if __name__ == '__main__':
+    main()
+
+# commands:
+# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
+# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
+# python etc/scripts/dataset_pipeline/annotate_composites.py

From c1c885801be65a0e640ff661a36d98469d50a3a2 Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Sun, 14 Jun 2026 20:17:07 +0000
Subject: [PATCH 2/3] Add safety filters and more name variants

---
 .../dataset_pipeline/annotate_composites.py   | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
index 4f4de5df00..b3d449ff52 100644
--- a/etc/scripts/dataset_pipeline/annotate_composites.py
+++ b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -11,34 +11,36 @@
 MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
 VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
 
-# extra short forms used in rule text that the license index doesnt have
+# short forms that appear in rule text but arent in the license index
 EXTRA_NAMES = {
-    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
     'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
     'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
     'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
     'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
-    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
     'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
     'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
     'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
     'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
     'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
     'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
-    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
-    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
     'mit': ['MIT License', 'MIT license', 'MIT'],
     'isc': ['ISC License', 'ISC license', 'ISC'],
     'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
     'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
     'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
     'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
-    'unlicense': ['Unlicense'],
+    'unlicense': ['Unlicense', 'UNLICENSE'],
 }
 
 
 def strip_version_suffix(name):
-    """removing trailing version from a license name"""
+    """remove trailing version number from a license name"""
     result = VERSION_SUFFIX_RE.sub('', name).strip()
     if len(result) < 10 or result == name:
         return None
@@ -46,7 +48,7 @@ def strip_version_suffix(name):
 
 
 def get_candidate_names(lic):
-    """collect names to search for.longest first"""
+    """collect names to search for, longest first"""
     names = []
     if lic.name:
         names.append(lic.name)
@@ -67,7 +69,7 @@ def get_candidate_names(lic):
 
 
 def find_in_text(text, candidates):
-    """case insensitive search.returns matched span having original case"""
+    """case insensitive search, returns the matched text in original case"""
     text_lower = text.lower()
     for name in candidates:
         if not name or len(name) < 3:
@@ -85,7 +87,7 @@ def find_in_text(text, candidates):
 @click.option('--limit', type=int, default=None)
 @click.option('--dry-run', is_flag=True)
 def main(rules_dir, expression_filter, limit, dry_run):
-    """annotate rules with required phrase markers"""
+    """Annotate composite license rules with required phrase markers"""
     if not rules_dir:
         repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
         rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
@@ -111,6 +113,10 @@ def main(rules_dir, expression_filter, limit, dry_run):
             continue
         if getattr(rule, 'is_required_phrase', False):
             continue
+        if getattr(rule, 'skip_for_required_phrase_generation', False):
+            continue
+        if not getattr(rule, 'is_approx_matchable', True):
+            continue
         text = rule.text or ''
         if MARKER_RE.search(text):
             continue
@@ -157,8 +163,3 @@ def main(rules_dir, expression_filter, limit, dry_run):
 
 if __name__ == '__main__':
     main()
-
-# commands:
-# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
-# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
-# python etc/scripts/dataset_pipeline/annotate_composites.py

From baa56a6d08b2d3f4f9f9b9d1ba99e20c75c85da8 Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Sun, 14 Jun 2026 21:01:21 +0000
Subject: [PATCH 3/3] Add tests for composite rule annotation script

---
 .../test_annotate_composites.py               | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 etc/scripts/dataset_pipeline/test_annotate_composites.py

diff --git a/etc/scripts/dataset_pipeline/test_annotate_composites.py b/etc/scripts/dataset_pipeline/test_annotate_composites.py
new file mode 100644
index 0000000000..053bc9a75a
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/test_annotate_composites.py
@@ -0,0 +1,96 @@
+# tests for annotate_composites.py
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+from annotate_composites import strip_version_suffix, get_candidate_names, find_in_text
+
+
+class TestStripVersionSuffix:
+
+    def test_strips_trailing_version(self):
+        assert strip_version_suffix('GNU General Public License 3.0') == 'GNU General Public License'
+
+    def test_strips_version_with_v_prefix(self):
+        assert strip_version_suffix('Mozilla Public License v2.0') == 'Mozilla Public License'
+
+    def test_strips_or_later(self):
+        result = strip_version_suffix('GNU Lesser General Public License 2.1 or later')
+        assert result == 'GNU Lesser General Public License'
+
+    def test_returns_none_when_result_too_short(self):
+        assert strip_version_suffix('EPL 1.0') is None  # 'EPL' is under the 10-character minimum
+
+    def test_returns_none_when_no_version(self):
+        assert strip_version_suffix('Boost Software License') is None  # nothing stripped, result equals input
+
+
+class FakeLicense:
+    """Stand-in exposing only the License attributes that get_candidate_names() reads."""
+    def __init__(self, key, name=None, short_name=None, spdx_license_key=None):
+        self.key = key  # the only required field; the other three default to None
+        self.name = name
+        self.short_name = short_name
+        self.spdx_license_key = spdx_license_key
+
+
+class TestGetCandidateNames:
+
+    def test_collects_all_fields(self):
+        lic = FakeLicense(
+            key='apache-2.0',
+            name='Apache License 2.0',
+            short_name='Apache 2.0',
+            spdx_license_key='Apache-2.0',
+        )
+        names = get_candidate_names(lic)
+        assert 'Apache License 2.0' in names
+        assert 'Apache 2.0' in names
+        assert 'Apache-2.0' in names
+        assert 'apache-2.0' in names  # the license key itself is a candidate too
+
+    def test_includes_extra_names_for_known_keys(self):
+        lic = FakeLicense(key='gpl-2.0', name='GNU General Public License 2.0')
+        names = get_candidate_names(lic)
+        assert 'GPLv2' in names  # supplied by EXTRA_NAMES, not by any license field
+        assert 'GPL 2.0' in names
+
+    def test_includes_version_stripped_base(self):
+        lic = FakeLicense(key='agpl-3.0', name='GNU Affero General Public License 3.0')
+        names = get_candidate_names(lic)
+        assert 'GNU Affero General Public License' in names
+
+    def test_sorted_longest_first(self):
+        lic = FakeLicense(key='mit', name='MIT License', short_name='MIT')
+        names = get_candidate_names(lic)
+        lengths = [len(name) for name in names]
+        assert lengths == sorted(lengths, reverse=True)  # whole ordering checked in one comparison
+
+    def test_no_duplicates(self):
+        lic = FakeLicense(key='isc', name='ISC License', short_name='ISC License',
+                          spdx_license_key='ISC')
+        names = get_candidate_names(lic)
+        assert len(names) == len(set(names))
+
+
+class TestFindInText:
+
+    def test_basic_match(self):
+        text = 'Licensed under the Apache License, Version 2.0'
+        assert find_in_text(text, ['Apache License']) == 'Apache License'
+
+    def test_case_insensitive(self):
+        text = 'distributed under the gnu general public license'
+        result = find_in_text(text, ['GNU General Public License'])
+        assert result == 'gnu general public license'  # returned in the text's casing, not the candidate's
+
+    def test_not_found(self):
+        text = 'this software uses the BSD license'
+        assert find_in_text(text, ['Apache License', 'MIT License']) is None
+
+    def test_skips_very_short_candidates(self):
+        assert find_in_text('released under XY terms', ['XY']) is None  # present, but under 3 characters
+
+    def test_longer_candidate_matched_first(self):
+        text = 'Released under the MIT License'
+        assert find_in_text(text, ['MIT License', 'MIT']) == 'MIT License'  # candidates are tried in list order