Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions etc/scripts/dataset_pipeline/annotate_composites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# annotates composite (AND/OR) license rules with {{ }} required phrase markers
# uses scancode's license index to find license names in rule text
import re
from pathlib import Path
import click
from license_expression import Licensing
from licensedcode.models import Rule, load_licenses
from licensedcode.models import rules_data_dir as default_rules_data_dir
from licensedcode.required_phrases import add_required_phrase_to_rule

# matches an existing {{ ... }} required phrase marker; the body is any run of
# characters other than '}' (re.DOTALL has no effect since the pattern has no '.')
MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
# matches a trailing version at the very end of a name: whitespace, an optional
# "v", digits and dots, then optionally "only", "or later" / "or-later" or "+"
VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)

# short forms that appear in rule text but aren't in the license index
# maps a license key to the extra spellings to search for; looked up by
# license key in get_candidate_names()
EXTRA_NAMES = {
    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2', 'GPL Version 2'],
    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
    'lgpl-2.0-plus': ['LGPL-2.0', 'LGPL-2.0+', 'LGPL 2.0'],
    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+', 'LGPL 2.1 or later'],
    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
    'mpl-1.1': ['MPL-1.1', 'MPL 1.1'],
    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause', 'BSD-3-clause'],
    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause', 'BSD 2-clause'],
    'mit': ['MIT License', 'MIT license', 'MIT'],
    'isc': ['ISC License', 'ISC license', 'ISC'],
    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
    'unlicense': ['Unlicense', 'UNLICENSE'],
}


def strip_version_suffix(name):
    """return the license name without its trailing version, or None

    None is returned when the suffix pattern removed nothing, or when fewer
    than 10 characters are left after stripping.
    """
    base = VERSION_SUFFIX_RE.sub('', name).strip()
    unchanged = base == name
    too_short = len(base) < 10
    return None if (too_short or unchanged) else base


def get_candidate_names(lic):
    """gather the spellings of a license to look for, longest first

    Sources, in insertion order: the full name, the name with its version
    suffix stripped, the short name, the SPDX key, the license key and any
    EXTRA_NAMES entries for that key. Exact duplicates are added only once.
    """
    collected = []

    def remember(value):
        # keep first-seen order, skip exact duplicates
        if value not in collected:
            collected.append(value)

    if lic.name:
        collected.append(lic.name)
        versionless = strip_version_suffix(lic.name)
        if versionless:
            collected.append(versionless)

    for optional in (lic.short_name, lic.spdx_license_key):
        if optional:
            remember(optional)

    remember(lic.key)

    for extra in EXTRA_NAMES.get(lic.key, []):
        remember(extra)

    # stable sort: equally long names keep their insertion order
    return sorted(collected, key=len, reverse=True)


def find_in_text(text, candidates):
    """case insensitive whole-word search, returns the matched text in original case

    Candidates are tried in the order given, so callers that want the longest
    name to win must pass them longest first. Empty candidates and candidates
    shorter than 3 characters are ignored. Returns None when nothing matches.

    A candidate only matches when it is not embedded in a larger word.
    A plain substring search would find "MIT" inside "permitted", "ISC"
    inside "disclaimer" or "GPLv3" inside "LGPLv3" and report the wrong span.
    """
    for name in candidates:
        if not name or len(name) < 3:
            continue
        # lookarounds instead of \b so that names starting or ending with
        # punctuation (e.g. "GPL-2.0+") can still match
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        # searching the original text (not a lowercased copy) keeps the
        # returned span aligned even when lowercasing would change its length
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            return found.group(0)
    return None


@click.command()
@click.option('--rules-dir', type=click.Path(exists=True), default=None)
@click.option('--expression-filter', default=None,
              help='only process rules containing this in their expression')
@click.option('--limit', type=int, default=None)
@click.option('--dry-run', is_flag=True)
def main(rules_dir, expression_filter, limit, dry_run):
    """Annotate composite license rules with required phrase markers"""
    # NOTE: the docstring above is the click help text, so details live here.
    #
    # rules_dir: directory of *.RULE files; when not given, the rules
    #   directory of the enclosing checkout is used if it exists, otherwise
    #   the default rules data dir from licensedcode.
    # expression_filter: substring that must occur in the license expression.
    # limit: stop once this many rules were processed (0/None means no limit).
    # dry_run: forwarded to add_required_phrase_to_rule; also prints the
    #   phrases found for each annotated rule.
    if not rules_dir:
        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir

    rules_path = Path(rules_dir)
    licenses_db = load_licenses()
    licensing = Licensing(list(licenses_db.values()))

    processed = 0
    annotated = 0

    for rf in sorted(rules_path.glob('*.RULE')):
        if limit and processed >= limit:
            break

        # only composite rules, recognized by "_or_" / "_and_" in the file name
        stem = rf.stem
        if '_or_' not in stem and '_and_' not in stem:
            continue

        try:
            rule = Rule.from_file(rule_file=str(rf))
        except Exception as exc:
            # best effort: report the unreadable rule and keep going
            click.echo(f'warning: could not load rule {rf.name}: {exc!r}', err=True)
            continue

        if not rule.license_expression:
            continue
        if getattr(rule, 'is_required_phrase', False):
            continue
        if getattr(rule, 'skip_for_required_phrase_generation', False):
            continue
        if not getattr(rule, 'is_approx_matchable', True):
            continue

        text = rule.text or ''
        # rules that already carry {{ }} markers are left alone
        if MARKER_RE.search(text):
            continue
        if expression_filter and expression_filter not in rule.license_expression:
            continue

        processed += 1
        try:
            keys = licensing.license_keys(rule.license_expression, unique=True)
        except Exception as exc:
            # best effort: report the unparsable expression and keep going
            click.echo(
                f'warning: could not parse expression of {rf.name}: {exc!r}',
                err=True,
            )
            continue
        if not keys:
            continue

        phrases = _find_required_phrases(text, keys, licenses_db)
        if phrases is None:
            continue

        # every phrase is submitted (no short-circuit); a truthy return from
        # any call counts the rule as annotated
        added = False
        for phrase in phrases:
            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
                                           source='composite_annotation', dry_run=dry_run):
                added = True
        if added:
            annotated += 1
            if dry_run:
                click.echo(f' {rule.identifier}: {phrases}')

    click.echo(f'\ndone - {annotated}/{processed} annotated')


def _find_required_phrases(text, keys, licenses_db):
    """return one matched phrase per license key, or None if any key has no match"""
    phrases = []
    for key in keys:
        lic = licenses_db.get(key)
        if not lic:
            return None
        match = find_in_text(text, get_candidate_names(lic))
        if not match:
            return None
        phrases.append(match)
    return phrases


# run the click command only when executed as a script, not when imported
if __name__ == '__main__':
    main()
96 changes: 96 additions & 0 deletions etc/scripts/dataset_pipeline/test_annotate_composites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# tests for annotate_composites.py
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))
from annotate_composites import strip_version_suffix, get_candidate_names, find_in_text


class TestStripVersionSuffix:
    """checks for strip_version_suffix()"""

    def test_strips_trailing_version(self):
        stripped = strip_version_suffix('GNU General Public License 3.0')
        assert stripped == 'GNU General Public License'

    def test_strips_version_with_v_prefix(self):
        stripped = strip_version_suffix('Mozilla Public License v2.0')
        assert stripped == 'Mozilla Public License'

    def test_strips_or_later(self):
        full_name = 'GNU Lesser General Public License 2.1 or later'
        assert strip_version_suffix(full_name) == 'GNU Lesser General Public License'

    def test_returns_none_when_result_too_short(self):
        # "EPL" alone is below the minimum length kept by the function
        outcome = strip_version_suffix('EPL 1.0')
        assert outcome is None

    def test_returns_none_when_no_version(self):
        outcome = strip_version_suffix('Boost Software License')
        assert outcome is None


class FakeLicense:
    """stand-in for a License object exposing only the attributes the tests read"""

    def __init__(self, key, name=None, short_name=None, spdx_license_key=None):
        self.key, self.name = key, name
        self.short_name, self.spdx_license_key = short_name, spdx_license_key


class TestGetCandidateNames:
    """checks for get_candidate_names()"""

    def test_collects_all_fields(self):
        apache = FakeLicense(
            key='apache-2.0',
            name='Apache License 2.0',
            short_name='Apache 2.0',
            spdx_license_key='Apache-2.0',
        )
        names = get_candidate_names(apache)
        for expected in ('Apache License 2.0', 'Apache 2.0', 'Apache-2.0', 'apache-2.0'):
            assert expected in names

    def test_includes_extra_names_for_known_keys(self):
        gpl = FakeLicense(key='gpl-2.0', name='GNU General Public License 2.0')
        names = get_candidate_names(gpl)
        for expected in ('GPLv2', 'GPL 2.0'):
            assert expected in names

    def test_includes_version_stripped_base(self):
        agpl = FakeLicense(key='agpl-3.0', name='GNU Affero General Public License 3.0')
        assert 'GNU Affero General Public License' in get_candidate_names(agpl)

    def test_sorted_longest_first(self):
        mit = FakeLicense(key='mit', name='MIT License', short_name='MIT')
        lengths = [len(candidate) for candidate in get_candidate_names(mit)]
        # non-increasing lengths are exactly a list equal to its descending sort
        assert lengths == sorted(lengths, reverse=True)

    def test_no_duplicates(self):
        isc = FakeLicense(key='isc', name='ISC License', short_name='ISC License',
                          spdx_license_key='ISC')
        names = get_candidate_names(isc)
        assert len(set(names)) == len(names)


class TestFindInText:
    """checks for find_in_text()"""

    def test_basic_match(self):
        found = find_in_text('Licensed under the Apache License, Version 2.0', ['Apache License'])
        assert found == 'Apache License'

    def test_case_insensitive(self):
        # the returned text keeps the casing used in the searched text
        lowercase_text = 'distributed under the gnu general public license'
        found = find_in_text(lowercase_text, ['GNU General Public License'])
        assert found == 'gnu general public license'

    def test_not_found(self):
        found = find_in_text('this software uses the BSD license', ['Apache License', 'MIT License'])
        assert found is None

    def test_skips_very_short_candidates(self):
        found = find_in_text('released under XY terms', ['XY'])
        assert found is None

    def test_longer_candidate_matched_first(self):
        found = find_in_text('Released under the MIT License', ['MIT License', 'MIT'])
        assert found == 'MIT License'
Loading