From c26d3f9ed031276df9130ea9e275ae313d004f1f Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 11:50:55 +0000 Subject: [PATCH 01/16] chore: install ripgrep Signed-off-by: Mouad BANI --- scripts/services/docker/Dockerfile.git_integration | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/services/docker/Dockerfile.git_integration b/scripts/services/docker/Dockerfile.git_integration index 84895879d1..4c9c371007 100644 --- a/scripts/services/docker/Dockerfile.git_integration +++ b/scripts/services/docker/Dockerfile.git_integration @@ -82,6 +82,7 @@ FROM base AS runner RUN apt-get update && apt-get install -y \ ca-certificates \ git \ + ripgrep \ --no-install-recommends \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean \ From e1f8dad3623d5392934d889cfab654e7dd997295 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 12:47:26 +0000 Subject: [PATCH 02/16] feat: leverage maintainersFile from db before falling back to regular detection Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 94 +++++++++++++++---- 1 file changed, 78 insertions(+), 16 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 1734dd75e6..41a58d6d80 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -397,9 +397,76 @@ async def find_maintainer_file(self, repo_path: str, owner: str, repo: str): return None, None, ai_cost - async def extract_maintainers(self, repo_path: str, owner: str, repo: str): + async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: + """ + Analyze file content with AI and return a MaintainerResult. + Raises MaintanerAnalysisError if no maintainers are found. 
+ """ + self.logger.info(f"Analyzing maintainer file: {filename}") + result = await self.analyze_file_content(filename, content) + + if not result.output.info: + raise MaintanerAnalysisError(ai_cost=result.cost) + + return MaintainerResult( + maintainer_file=filename, + maintainer_info=result.output.info, + total_cost=result.cost, + ) + + async def try_saved_maintainer_file( + self, repo_path: str, saved_maintainer_file: str + ) -> tuple[MaintainerResult | None, float]: + """ + Attempt to read and analyze the previously saved maintainer file. + Returns (result, cost) where result is None if the attempt failed. + """ + cost = 0.0 + file_path = os.path.join(repo_path, saved_maintainer_file) + + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' no longer exists on disk" + ) + return None, cost + + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + + result = await self.analyze_and_build_result(saved_maintainer_file, content) + cost += result.total_cost + return result, cost + except MaintanerAnalysisError as e: + cost += e.ai_cost + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' analysis failed: {e.error_message}" + ) + return None, cost + except Exception as e: + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' processing failed: {repr(e)}" + ) + return None, cost + + async def extract_maintainers( + self, + repo_path: str, + owner: str, + repo: str, + saved_maintainer_file: str | None = None, + ): total_cost = 0 + if saved_maintainer_file: + self.logger.info(f"Trying saved maintainer file: {saved_maintainer_file}") + result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) + total_cost += cost + if result: + result.total_cost = total_cost + return result + self.logger.info("Falling back to maintainer file detection") + self.logger.info("Looking for maintainer 
file...") maintainer_file, file_content, cost = await self.find_maintainer_file( repo_path, owner, repo @@ -411,21 +478,11 @@ async def extract_maintainers(self, repo_path: str, owner: str, repo: str): raise MaintainerFileNotFoundError(ai_cost=total_cost) decoded_content = base64.b64decode(file_content).decode("utf-8") + result = await self.analyze_and_build_result(maintainer_file, decoded_content) + total_cost += result.total_cost - self.logger.info(f"Analyzing maintainer file: {maintainer_file}") - result = await self.analyze_file_content(maintainer_file, decoded_content) - maintainer_info = result.output.info - total_cost += result.cost - - if not maintainer_info: - self.logger.error("Failed to analyze the maintainer file content.") - raise MaintanerAnalysisError(ai_cost=total_cost) - - return MaintainerResult( - maintainer_file=maintainer_file, - maintainer_info=maintainer_info, - total_cost=total_cost, - ) + result.total_cost = total_cost + return result async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -514,7 +571,12 @@ async def process_maintainers( ) self.logger.info(f"Starting maintainers processing for repo: {batch_info.remote}") - maintainers = await self.extract_maintainers(batch_info.repo_path, owner, repo_name) + maintainers = await self.extract_maintainers( + batch_info.repo_path, + owner, + repo_name, + saved_maintainer_file=repository.maintainer_file, + ) latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost maintainers_found = len(maintainers.maintainer_info) From e31275bcaa3f457b7edca682fef92a235dc427bd Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 15:31:06 +0000 Subject: [PATCH 03/16] feat: improve maintainers detection & analysis Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 244 ++++++++++++++---- 1 file changed, 192 insertions(+), 52 deletions(-) diff --git 
a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 41a58d6d80..da0f995950 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -1,5 +1,4 @@ import asyncio -import base64 import os import time as time_module from datetime import datetime, time, timezone @@ -20,6 +19,7 @@ ) from crowdgit.enums import ErrorCode, ExecutionStatus, OperationType from crowdgit.errors import ( + CommandExecutionError, CrowdGitError, MaintainerFileNotFoundError, MaintainerIntervalNotElapsedError, @@ -37,7 +37,7 @@ from crowdgit.models.service_execution import ServiceExecution from crowdgit.services.base.base_service import BaseService from crowdgit.services.maintainer.bedrock import invoke_bedrock -from crowdgit.services.utils import parse_repo_url +from crowdgit.services.utils import run_shell_command from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS @@ -46,24 +46,60 @@ class MaintainerService(BaseService): MAX_CHUNK_SIZE = 5000 MAX_CONCURRENT_CHUNKS = 3 # Maximum concurrent chunk processing + MAX_AI_ANALYSIS_ATTEMPTS = 3 + # List of common maintainer file names MAINTAINER_FILES = [ "MAINTAINERS", "MAINTAINERS.md", "MAINTAINER.md", + "CODEOWNERS", "CODEOWNERS.md", "CONTRIBUTORS", "CONTRIBUTORS.md", - "docs/MAINTAINERS.md", "OWNERS", - "CODEOWNERS", + "OWNERS.md", + "AUTHORS", + "AUTHORS.md", + "docs/MAINTAINERS.md", ".github/MAINTAINERS.md", ".github/CONTRIBUTORS.md", + ".github/CODEOWNERS", "GOVERNANCE.md", - "README.md", - "SECURITY-INSIGHTS.md", ] + VALID_EXTENSIONS = { + "", + ".md", + ".markdown", + ".txt", + ".rst", + ".yaml", + ".yml", + ".toml", + ".adoc", + ".csv", + } + + CONTENT_VALIDATION_KEYWORDS = [ + "maintainer", + "codeowner", + "owner", + "contributor", + "author", 
+ "reviewer", + "governance", + "lead", + "approver", + "committer", + "credit", + "administrator", + "steward", + "emeritus", + ] + + EXCLUDED_FILENAMES = {"contributing.md", "contributing"} + def make_role(self, title: str): title = title.lower() title = ( @@ -358,44 +394,122 @@ async def find_maintainer_file_with_ai(self, file_names): else: return None, result.cost - async def find_maintainer_file(self, repo_path: str, owner: str, repo: str): - self.logger.info(f"Looking for maintainer files in {owner}/{repo}...") - - file_names = await aiofiles.os.listdir(repo_path) - - for file in self.MAINTAINER_FILES: - file_path = os.path.join(repo_path, file) - if await aiofiles.os.path.isfile(file_path): - self.logger.info(f"maintainer file: {file_path} found in repo") - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() + async def _list_repo_files(self, repo_path: str) -> list[str]: + """List all files in the repo recursively, respecting .gitignore via rg.""" + try: + output = await run_shell_command( + ["rg", "--files", "--hidden", "--glob", "!.git/", "."], cwd=repo_path + ) + return [ + line[2:] if line.startswith("./") else line + for line in output.strip().split("\n") + if line.strip() + ] + except Exception as e: + self.logger.warning(f"rg --files failed, falling back to os.walk: {repr(e)}") + results = [] + for dirpath, dirnames, filenames in os.walk(repo_path): + dirnames[:] = [d for d in dirnames if d != ".git"] + for filename in filenames: + full_path = os.path.join(dirpath, filename) + results.append(os.path.relpath(full_path, repo_path)) + return results + + async def _ripgrep_search(self, repo_path: str) -> list[str]: + """Search for files containing maintainer-related keywords, filtered to valid extensions.""" + pattern = "|".join(self.CONTENT_VALIDATION_KEYWORDS) + + exclusion_globs = ["--glob", "!.git/"] + for name in self.EXCLUDED_FILENAMES: + exclusion_globs.extend(["--iglob", f"!{name}"]) - if file.lower() == 
"readme.md" and "maintainer" not in content.lower(): - self.logger.info(f"Skipping {file}: no maintainer-related content found") - continue + try: + output = await run_shell_command( + ["rg", "-l", "-i", "--hidden", pattern, *exclusion_globs, "."], cwd=repo_path + ) + except CommandExecutionError: + self.logger.info("Ripgrep found no files containing maintainer keywords") + return [] + except Exception as e: + self.logger.warning(f"Ripgrep search failed: {repr(e)}") + return [] - return file, base64.b64encode(content.encode()).decode(), 0 + results = [] + for line in output.strip().split("\n"): + line = line.strip() + if not line: + continue + if line.startswith("./"): + line = line[2:] + basename = os.path.basename(line).lower() + ext = os.path.splitext(basename)[1] + if ext not in self.VALID_EXTENSIONS: + self.logger.debug(f"Skipping '{line}': extension '{ext}' not in valid extensions") + continue + if ext == "" and not any(kw in basename for kw in self.CONTENT_VALIDATION_KEYWORDS): + self.logger.debug( + f"Skipping extensionless file '{line}': " + f"basename '{basename}' contains no governance keyword" + ) + continue + results.append(line) - self.logger.warning("No maintainer files found using the known file names.") + self.logger.info(f"Ripgrep found {len(results)} candidate files after filtering") + return results - file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str]]: + """ + Find all potential maintainer files using static list + dynamic ripgrep search. + Returns ordered list of (relative_path, content) tuples. + Static matches come first, then dynamic matches sorted by content keyword score. 
+ """ + candidates_static = [] + static_paths_lower = set() - if file_name: - file_path = os.path.join(repo_path, file_name) + for file in self.MAINTAINER_FILES: + file_path = os.path.join(repo_path, file) if await aiofiles.os.path.isfile(file_path): + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + candidates_static.append((file, content)) + static_paths_lower.add(file.lower()) + self.logger.info(f"Static match found: {file}") + except Exception as e: + self.logger.warning(f"Failed to read static match {file}: {repr(e)}") + + dynamic_paths = await self._ripgrep_search(repo_path) + + scored_dynamic = [] + for candidate_path in dynamic_paths: + if candidate_path.lower() in static_paths_lower: + continue + + file_path = os.path.join(repo_path, candidate_path) + try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() + except Exception as e: + self.logger.warning(f"Failed to read dynamic match {candidate_path}: {repr(e)}") + continue - if file_name.lower() == "readme.md" and "maintainer" not in content.lower(): - self.logger.info( - f"AI suggested {file_name}, but it has no maintainer-related content. Skipping." 
- ) - return None, None, ai_cost + content_lower = content.lower() + # Calculate score based on keywords matched in the content + score = sum(1 for kw in self.CONTENT_VALIDATION_KEYWORDS if kw in content_lower) + if score > 0: + scored_dynamic.append((candidate_path, content, score)) + self.logger.info( + f"Dynamic match validated: {candidate_path} (keyword score: {score})" + ) - self.logger.info(f"\nMaintainer file found: {file_name}") - return file_name, base64.b64encode(content.encode()).decode(), ai_cost + # Sort by score DESC + scored_dynamic.sort(key=lambda c: c[2], reverse=True) - return None, None, ai_cost + result = candidates_static + [(path, content) for path, content, _ in scored_dynamic] + self.logger.info( + f"Found {len(candidates_static)} static and {len(scored_dynamic)} dynamic candidates" + ) + return result async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: """ @@ -452,12 +566,11 @@ async def try_saved_maintainer_file( async def extract_maintainers( self, repo_path: str, - owner: str, - repo: str, saved_maintainer_file: str | None = None, ): total_cost = 0 + # Step 1: Try the previously saved maintainer file if saved_maintainer_file: self.logger.info(f"Trying saved maintainer file: {saved_maintainer_file}") result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) @@ -467,22 +580,53 @@ async def extract_maintainers( return result self.logger.info("Falling back to maintainer file detection") - self.logger.info("Looking for maintainer file...") - maintainer_file, file_content, cost = await self.find_maintainer_file( - repo_path, owner, repo - ) - total_cost += cost + # Step 2: Find candidates via static list + ripgrep dynamic search + candidates = await self.find_candidate_files(repo_path) + + # Step 3: Try AI analysis on candidates, stop on first success + if candidates: + attempts = min(len(candidates), self.MAX_AI_ANALYSIS_ATTEMPTS) + for filename, content in 
candidates[:attempts]: + try: + result = await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + result.total_cost = total_cost + return result + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") + except Exception as e: + self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + + self.logger.warning( + f"AI analysis failed for all {attempts} candidate(s), trying AI file detection" + ) + else: + self.logger.warning("No candidate files found via search, trying AI file detection") - if not maintainer_file or not file_content: - self.logger.error("No maintainer file found") - raise MaintainerFileNotFoundError(ai_cost=total_cost) + # Step 4: AI file detection as last resort + file_names = await self._list_repo_files(repo_path) + ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + total_cost += ai_cost - decoded_content = base64.b64decode(file_content).decode("utf-8") - result = await self.analyze_and_build_result(maintainer_file, decoded_content) - total_cost += result.total_cost + if ai_file_name: + file_path = os.path.join(repo_path, ai_file_name) + if await aiofiles.os.path.isfile(file_path): + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + result = await self.analyze_and_build_result(ai_file_name, content) + total_cost += result.total_cost + result.total_cost = total_cost + return result + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning( + f"AI-suggested file '{ai_file_name}' analysis failed: {e.error_message}" + ) - result.total_cost = total_cost - return result + self.logger.error("No maintainer file found") + raise MaintainerFileNotFoundError(ai_cost=total_cost) async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -560,8 +704,6 @@ async def process_maintainers( 
maintainers_skipped = 0 try: - owner, repo_name = parse_repo_url(batch_info.remote) - has_interval_elapsed, remaining_hours = await self.check_if_interval_elapsed( repository ) @@ -573,8 +715,6 @@ async def process_maintainers( self.logger.info(f"Starting maintainers processing for repo: {batch_info.remote}") maintainers = await self.extract_maintainers( batch_info.repo_path, - owner, - repo_name, saved_maintainer_file=repository.maintainer_file, ) latest_maintainer_file = maintainers.maintainer_file From 67ace7981f92f8ac5c9bf893ed2f953dd3aaeda0 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 15:39:09 +0000 Subject: [PATCH 04/16] feat: track analyzed maintainers files in metrics Signed-off-by: Mouad BANI --- .../src/crowdgit/models/maintainer_info.py | 2 ++ .../services/maintainer/maintainer_service.py | 25 ++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 5a420567ae..6914059a2b 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -34,3 +34,5 @@ class MaintainerResult(BaseModel): maintainer_file: str | None = None maintainer_info: list[MaintainerInfoItem] | None = None total_cost: float = 0 + candidate_files: list[str] = [] + ai_suggested_file: str | None = None diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index da0f995950..827ad047c0 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -569,6 +569,14 @@ async def extract_maintainers( saved_maintainer_file: str | None = None, ): total_cost = 0 
+ candidate_files: list[str] = [] + ai_suggested_file: str | None = None + + def _attach_metadata(result: MaintainerResult) -> MaintainerResult: + result.total_cost = total_cost + result.candidate_files = candidate_files + result.ai_suggested_file = ai_suggested_file + return result # Step 1: Try the previously saved maintainer file if saved_maintainer_file: @@ -576,12 +584,12 @@ async def extract_maintainers( result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) total_cost += cost if result: - result.total_cost = total_cost - return result + return _attach_metadata(result) self.logger.info("Falling back to maintainer file detection") # Step 2: Find candidates via static list + ripgrep dynamic search candidates = await self.find_candidate_files(repo_path) + candidate_files = [path for path, _ in candidates] # Step 3: Try AI analysis on candidates, stop on first success if candidates: @@ -590,8 +598,7 @@ async def extract_maintainers( try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost - result.total_cost = total_cost - return result + return _attach_metadata(result) except MaintanerAnalysisError as e: total_cost += e.ai_cost self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") @@ -607,6 +614,7 @@ async def extract_maintainers( # Step 4: AI file detection as last resort file_names = await self._list_repo_files(repo_path) ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + ai_suggested_file = ai_file_name total_cost += ai_cost if ai_file_name: @@ -617,8 +625,7 @@ async def extract_maintainers( content = await f.read() result = await self.analyze_and_build_result(ai_file_name, content) total_cost += result.total_cost - result.total_cost = total_cost - return result + return _attach_metadata(result) except MaintanerAnalysisError as e: total_cost += e.ai_cost self.logger.warning( @@ -702,6 +709,8 @@ async def process_maintainers( ai_cost = 
0.0 maintainers_found = 0 maintainers_skipped = 0 + candidate_files: list[str] = [] + ai_suggested_file: str | None = None try: has_interval_elapsed, remaining_hours = await self.check_if_interval_elapsed( @@ -720,6 +729,8 @@ async def process_maintainers( latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost maintainers_found = len(maintainers.maintainer_info) + candidate_files = maintainers.candidate_files + ai_suggested_file = maintainers.ai_suggested_file if repository.parent_repo: filtered_maintainers = await self.exclude_parent_repo_maintainers( @@ -774,6 +785,8 @@ async def process_maintainers( "ai_cost": ai_cost, "maintainers_found": maintainers_found, "maintainers_skipped": maintainers_skipped, + "candidate_files": candidate_files, + "ai_suggested_file": ai_suggested_file, }, ) await save_service_execution(service_execution) From 9018e808053d218abf778d8d068aec632757cbda Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:32:20 +0000 Subject: [PATCH 05/16] feat: change candidate file detection to be more narrow Signed-off-by: Mouad BANI --- .../src/crowdgit/models/maintainer_info.py | 2 +- .../services/maintainer/maintainer_service.py | 272 ++++++++++-------- 2 files changed, 146 insertions(+), 128 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 6914059a2b..1752999e54 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -34,5 +34,5 @@ class MaintainerResult(BaseModel): maintainer_file: str | None = None maintainer_info: list[MaintainerInfoItem] | None = None total_cost: float = 0 - candidate_files: list[str] = [] + candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None diff --git 
a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 827ad047c0..88d5e5ec57 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -45,28 +45,51 @@ class MaintainerService(BaseService): """Service for processing maintainer data""" MAX_CHUNK_SIZE = 5000 - MAX_CONCURRENT_CHUNKS = 3 # Maximum concurrent chunk processing - MAX_AI_ANALYSIS_ATTEMPTS = 3 - - # List of common maintainer file names - MAINTAINER_FILES = [ - "MAINTAINERS", - "MAINTAINERS.md", - "MAINTAINER.md", - "CODEOWNERS", - "CODEOWNERS.md", - "CONTRIBUTORS", - "CONTRIBUTORS.md", - "OWNERS", - "OWNERS.md", - "AUTHORS", - "AUTHORS.md", - "docs/MAINTAINERS.md", - ".github/MAINTAINERS.md", - ".github/CONTRIBUTORS.md", - ".github/CODEOWNERS", - "GOVERNANCE.md", - ] + MAX_CONCURRENT_CHUNKS = 3 + + # Full paths that get the highest score bonus when matched exactly + KNOWN_PATHS = { + "maintainers", + "maintainers.md", + "maintainer.md", + "codeowners", + "codeowners.md", + "contributors", + "contributors.md", + "owners", + "owners.md", + "authors", + "authors.md", + "governance.md", + "docs/maintainers.md", + ".github/maintainers.md", + ".github/contributors.md", + ".github/codeowners", + } + + # Governance stems (basename without extension, lowercased) for filename search + GOVERNANCE_STEMS = { + "maintainers", + "maintainer", + "codeowners", + "codeowner", + "contributors", + "contributor", + "owners", + "owners_aliases", + "authors", + "committers", + "commiters", + "reviewers", + "approvers", + "administrators", + "stewards", + "credits", + "governance", + "core_team", + "code_owners", + "emeritus", + } VALID_EXTENSIONS = { "", @@ -79,26 +102,31 @@ class MaintainerService(BaseService): ".toml", ".adoc", ".csv", + ".rdoc", } - 
CONTENT_VALIDATION_KEYWORDS = [ + SCORING_KEYWORDS = [ "maintainer", "codeowner", "owner", "contributor", - "author", - "reviewer", "governance", - "lead", - "approver", - "committer", - "credit", - "administrator", "steward", "emeritus", + "approver", + "reviewer", ] - EXCLUDED_FILENAMES = {"contributing.md", "contributing"} + EXCLUDED_FILENAMES = { + "contributing.md", + "contributing", + "code_of_conduct.md", + "code-of-conduct.md", + } + + FULL_PATH_SCORE = 100 + STEM_MATCH_SCORE = 50 + PARTIAL_STEM_SCORE = 25 def make_role(self, title: str): title = title.lower() @@ -385,7 +413,7 @@ def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[ async def find_maintainer_file_with_ai(self, file_names): self.logger.info("Using AI to find maintainer files...") - prompt = self.get_maintainer_file_prompt(self.MAINTAINER_FILES, file_names) + prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), file_names) result = await invoke_bedrock(prompt, pydantic_model=MaintainerFile) if result.output.file_name is not None: @@ -395,40 +423,39 @@ async def find_maintainer_file_with_ai(self, file_names): return None, result.cost async def _list_repo_files(self, repo_path: str) -> list[str]: - """List all files in the repo recursively, respecting .gitignore via rg.""" - try: - output = await run_shell_command( - ["rg", "--files", "--hidden", "--glob", "!.git/", "."], cwd=repo_path - ) - return [ - line[2:] if line.startswith("./") else line - for line in output.strip().split("\n") - if line.strip() - ] - except Exception as e: - self.logger.warning(f"rg --files failed, falling back to os.walk: {repr(e)}") - results = [] - for dirpath, dirnames, filenames in os.walk(repo_path): - dirnames[:] = [d for d in dirnames if d != ".git"] - for filename in filenames: - full_path = os.path.join(dirpath, filename) - results.append(os.path.relpath(full_path, repo_path)) - return results + """List non-code files in the repo recursively, filtered by 
VALID_EXTENSIONS.""" + glob_args = ["--glob", "!.git/"] + for ext in self.VALID_EXTENSIONS: + glob_args.extend(["--iglob", f"*{ext}"]) - async def _ripgrep_search(self, repo_path: str) -> list[str]: - """Search for files containing maintainer-related keywords, filtered to valid extensions.""" - pattern = "|".join(self.CONTENT_VALIDATION_KEYWORDS) + output = await run_shell_command( + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path + ) + return [ + line[2:] if line.startswith("./") else line + for line in output.strip().split("\n") + if line.strip() + ] - exclusion_globs = ["--glob", "!.git/"] - for name in self.EXCLUDED_FILENAMES: - exclusion_globs.extend(["--iglob", f"!{name}"]) + async def _ripgrep_search(self, repo_path: str) -> list[str]: + """Search for files whose basename matches a governance stem, at any depth.""" + glob_args = ["--glob", "!.git/"] + for stem in self.GOVERNANCE_STEMS: + glob_args.extend( + [ + "--iglob", + f"*{stem}*", + "--iglob", + f"*{stem}*.*", + ] + ) try: output = await run_shell_command( - ["rg", "-l", "-i", "--hidden", pattern, *exclusion_globs, "."], cwd=repo_path + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path ) except CommandExecutionError: - self.logger.info("Ripgrep found no files containing maintainer keywords") + self.logger.info("Ripgrep found no governance files by filename") return [] except Exception as e: self.logger.warning(f"Ripgrep search failed: {repr(e)}") @@ -442,74 +469,64 @@ async def _ripgrep_search(self, repo_path: str) -> list[str]: if line.startswith("./"): line = line[2:] basename = os.path.basename(line).lower() + if basename in self.EXCLUDED_FILENAMES: + continue ext = os.path.splitext(basename)[1] if ext not in self.VALID_EXTENSIONS: - self.logger.debug(f"Skipping '{line}': extension '{ext}' not in valid extensions") - continue - if ext == "" and not any(kw in basename for kw in self.CONTENT_VALIDATION_KEYWORDS): - self.logger.debug( - f"Skipping extensionless file '{line}': 
" - f"basename '{basename}' contains no governance keyword" - ) continue results.append(line) - self.logger.info(f"Ripgrep found {len(results)} candidate files after filtering") + self.logger.info(f"Ripgrep found {len(results)} governance files by filename") return results - async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str]]: + def _score_filename(self, candidate_path: str) -> int: + """Score by how closely the filename matches known governance patterns.""" + path = candidate_path.lower() + if path in self.KNOWN_PATHS: + return self.FULL_PATH_SCORE + stem = os.path.splitext(os.path.basename(path))[0].lstrip(".") + if stem in self.GOVERNANCE_STEMS: + return self.STEM_MATCH_SCORE + if any(known_stem in stem for known_stem in self.GOVERNANCE_STEMS): + return self.PARTIAL_STEM_SCORE + return 0 + + async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str, int]]: """ - Find all potential maintainer files using static list + dynamic ripgrep search. - Returns ordered list of (relative_path, content) tuples. - Static matches come first, then dynamic matches sorted by content keyword score. + Find governance files by filename, score them, and return all candidates sorted by score. + Scoring: full known-path match (100) > exact stem (50) > partial stem (25) + content keywords (+1 each). 
""" - candidates_static = [] - static_paths_lower = set() - - for file in self.MAINTAINER_FILES: - file_path = os.path.join(repo_path, file) - if await aiofiles.os.path.isfile(file_path): - try: - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() - candidates_static.append((file, content)) - static_paths_lower.add(file.lower()) - self.logger.info(f"Static match found: {file}") - except Exception as e: - self.logger.warning(f"Failed to read static match {file}: {repr(e)}") - - dynamic_paths = await self._ripgrep_search(repo_path) - - scored_dynamic = [] - for candidate_path in dynamic_paths: - if candidate_path.lower() in static_paths_lower: - continue + found_paths = await self._ripgrep_search(repo_path) + if not found_paths: + return [] + scored = [] + for candidate_path in found_paths: file_path = os.path.join(repo_path, candidate_path) try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() except Exception as e: - self.logger.warning(f"Failed to read dynamic match {candidate_path}: {repr(e)}") + self.logger.warning(f"Failed to read candidate {candidate_path}: {repr(e)}") continue - content_lower = content.lower() - # Calculate score based on keywords matched in the content - score = sum(1 for kw in self.CONTENT_VALIDATION_KEYWORDS if kw in content_lower) - if score > 0: - scored_dynamic.append((candidate_path, content, score)) - self.logger.info( - f"Dynamic match validated: {candidate_path} (keyword score: {score})" - ) + filename_score = self._score_filename(candidate_path) + content_score = sum(1 for kw in self.SCORING_KEYWORDS if kw in content.lower()) + total = filename_score + content_score - # Sort by score DESC - scored_dynamic.sort(key=lambda c: c[2], reverse=True) + scored.append((candidate_path, content, total)) + self.logger.info( + f"Candidate: {candidate_path} " + f"(filename: {filename_score}, content: {content_score}, total: {total})" + ) - result = 
candidates_static + [(path, content) for path, content, _ in scored_dynamic] - self.logger.info( - f"Found {len(candidates_static)} static and {len(scored_dynamic)} dynamic candidates" - ) - return result + scored.sort(key=lambda c: c[2], reverse=True) + + if scored: + self.logger.info(f"Top candidate: {scored[0][0]} (from {len(scored)} total)") + else: + self.logger.info("No valid candidates after scoring") + return scored async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: """ @@ -569,7 +586,7 @@ async def extract_maintainers( saved_maintainer_file: str | None = None, ): total_cost = 0 - candidate_files: list[str] = [] + candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None def _attach_metadata(result: MaintainerResult) -> MaintainerResult: @@ -587,27 +604,24 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: return _attach_metadata(result) self.logger.info("Falling back to maintainer file detection") - # Step 2: Find candidates via static list + ripgrep dynamic search + # Step 2: Find top candidate via filename search + scoring candidates = await self.find_candidate_files(repo_path) - candidate_files = [path for path, _ in candidates] + candidate_files = [(path, score) for path, _, score in candidates] - # Step 3: Try AI analysis on candidates, stop on first success + # Step 3: Try AI analysis on top candidate if candidates: - attempts = min(len(candidates), self.MAX_AI_ANALYSIS_ATTEMPTS) - for filename, content in candidates[:attempts]: - try: - result = await self.analyze_and_build_result(filename, content) - total_cost += result.total_cost - return _attach_metadata(result) - except MaintanerAnalysisError as e: - total_cost += e.ai_cost - self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") - except Exception as e: - self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + filename, content, _ = candidates[0] + try: + result = 
await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + return _attach_metadata(result) + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") + except Exception as e: + self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") - self.logger.warning( - f"AI analysis failed for all {attempts} candidate(s), trying AI file detection" - ) + self.logger.warning("Top candidate failed, trying AI file detection") else: self.logger.warning("No candidate files found via search, trying AI file detection") @@ -619,7 +633,11 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: if ai_file_name: file_path = os.path.join(repo_path, ai_file_name) - if await aiofiles.os.path.isfile(file_path): + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"AI suggested '{ai_file_name}' but file does not exist on disk" + ) + else: try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() From ae33af60a883bfec1f729207ce0e82691cf50dde Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:37:33 +0000 Subject: [PATCH 06/16] fix: enable email fallback for identity lookup during maintainer update Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 88d5e5ec57..d52c99df86 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -188,14 +188,18 @@ async def compare_and_update_maintainers( for github_username, maintainer in new_maintainers_dict.items(): role = 
maintainer.normalized_title original_role = self.make_role(maintainer.title) - if github_username == "unknown": + if github_username == "unknown" and maintainer.email in ("unknown", None): self.logger.warning( - f"Skipping unkown github_username with title {maintainer.title}" + f"Skipping unknown github_username & email with title {maintainer.title}" ) continue elif github_username not in current_maintainers_dict: # New maintainer - identity_id = await find_github_identity(github_username) + identity_id = ( + await find_github_identity(github_username) + if github_username != "unknown" + else await find_maintainer_identity_by_email(maintainer.email) + ) self.logger.info(f"Found new maintainer {github_username} to be inserted") if identity_id: await upsert_maintainer( @@ -205,7 +209,7 @@ async def compare_and_update_maintainers( f"Successfully inserted new maintainer {github_username} with identity_id {identity_id}" ) else: - # will happend for new users if their identity isn't created yet but should fixed on the next iteration + # will happen for new users if their identity isn't created yet but should be fixed on the next iteration self.logger.warning(f"Identity not found for username: {github_username}") else: # Existing maintainer From 019f6df6c7a838e1b90e5aeee9559284ee6546e8 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:45:05 +0000 Subject: [PATCH 07/16] chore: avoid bulding ai prompt when full content if batching is required Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index d52c99df86..b89923c21c 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ 
b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -309,7 +309,6 @@ def get_extraction_prompt(self, filename: str, content_to_analyze: str) -> str: """ async def analyze_file_content(self, maintainer_filename: str, content: str): - prompt = self.get_extraction_prompt(maintainer_filename, content) if len(content) > self.MAX_CHUNK_SIZE: self.logger.info( "Maintainers file content exceeded max chunk size, splitting into chunks" @@ -353,7 +352,10 @@ async def process_chunk(chunk_index: int, chunk: str): aggregated_info.cost += chunk_info.cost maintainer_info = aggregated_info else: - maintainer_info = await invoke_bedrock(prompt, pydantic_model=MaintainerInfo) + maintainer_info = await invoke_bedrock( + self.get_extraction_prompt(maintainer_filename, content), + pydantic_model=MaintainerInfo, + ) self.logger.info("Maintainers file content analyzed by AI") self.logger.info(f"Maintainers response: {maintainer_info}") if maintainer_info.output.info is not None: From f284e8ed0df89f5386f65657060a18df04b8062f Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:46:19 +0000 Subject: [PATCH 08/16] fix: remove duplicate rg pattern Signed-off-by: Mouad BANI --- .../crowdgit/services/maintainer/maintainer_service.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index b89923c21c..93bdad5f32 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -447,14 +447,7 @@ async def _ripgrep_search(self, repo_path: str) -> list[str]: """Search for files whose basename matches a governance stem, at any depth.""" glob_args = ["--glob", "!.git/"] for stem in self.GOVERNANCE_STEMS: - glob_args.extend( - [ - 
"--iglob", - f"*{stem}*", - "--iglob", - f"*{stem}*.*", - ] - ) + glob_args.extend(["--iglob", f"*{stem}*"]) try: output = await run_shell_command( From 969944ee197f042dbaccfc0f7798a2854474bb2b Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:54:24 +0000 Subject: [PATCH 09/16] chore: add extra validation for reamde files to have maintainer keyword in content Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 93bdad5f32..5f772cc7df 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -533,6 +533,11 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai Raises MaintanerAnalysisError if no maintainers are found. 
""" self.logger.info(f"Analyzing maintainer file: {filename}") + if "readme" in filename.lower() and "maintainer" not in content.lower(): + self.logger.warning( + f"Skipping README file '{filename}': no 'maintainer' keyword found in content" + ) + raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND) result = await self.analyze_file_content(filename, content) if not result.output.info: From 77407152ca8b0f66a5839eeb8f126ca9be72b8df Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 14:32:41 +0000 Subject: [PATCH 10/16] feat: improve ai fallback detection by passing scored candidates and improve prompt Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 56 +++++++++++++++---- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 5f772cc7df..37c45ca291 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -46,6 +46,7 @@ class MaintainerService(BaseService): MAX_CHUNK_SIZE = 5000 MAX_CONCURRENT_CHUNKS = 3 + MAX_AI_FILE_LIST_SIZE = 300 # Full paths that get the highest score bonus when matched exactly KNOWN_PATHS = { @@ -376,33 +377,40 @@ async def process_chunk(chunk_index: int, chunk: str): ai_cost=maintainer_info.cost, ) - def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[str]) -> str: + def get_maintainer_file_prompt( + self, example_files: list[str], candidates: list[tuple[str, int]] + ) -> str: """ Generates the prompt for the LLM to identify a maintainer file from a list. + candidates: list of (filename, score) where score reflects name-match strength. 
""" example_files_str = "\n".join(f"- {name}" for name in example_files) - file_names_str = "\n".join(f"- {name}" for name in file_names) + candidates_str = "\n".join(f"- {name} [score={score}]" for name, score in candidates) return f""" - You are an expert AI assistant specializing in identifying repository governance files. Your task is to find a maintainer file from a given list of filenames. + You are an expert AI assistant specializing in identifying repository governance files. Your task is to find the single best maintainer file from a given list of candidates. - 1. **Analyze the Input**: Carefully review the list of filenames provided in the `` tag. - 2. **Identify a Maintainer File**: Compare each filename against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. - 3. **Apply Rules**: Follow all constraints listed in the `` section, especially the exclusion rule. - 4. **Select the First Match**: Scan the list and select the *first* filename that you identify as a maintainer file. You only need to find one. Once a match is found, stop searching. + 1. **Analyze the Input**: Carefully review the list of candidates in the `` tag. Each entry shows the file path and a pre-computed name-match score. + 2. **Identify the Best Maintainer File**: Compare each candidate against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. + 3. **Use Signals to Rank**: When multiple candidates qualify, prefer: + - Higher **score** — stronger filename match against known governance patterns. + - Fewer path separators (`/`) in the path — files closer to the repo root apply to the whole project; deeply nested files are usually component-specific. + - When score and nesting conflict, prefer the file most likely to be the repo-wide governance file. + 4. 
**Apply Rules**: Follow all constraints listed in the `` section. 5. **Format the Output**: Return your answer as a single JSON object according to the `` specification, and nothing else. - **Definition**: A maintainer file's name usually contains keywords like `MAINTAINERS`, `CODEOWNERS`, or `OWNERS`. - **Exclusion**: The filename `CONTRIBUTING.md` must ALWAYS be ignored and never selected, even if it's the only file that seems relevant. + - **Third-party exclusion**: Do NOT select files that are inside directories associated with vendored dependencies, third-party libraries, or packages consumed by the project (e.g. paths containing `vendor/`, `node_modules/`, `third_party/`, `external/`, `.cache/`, `dist/`, `site-packages/`). These files belong to external projects, not this repository's own governance. - **No Match**: If no file in the list matches the criteria after checking all of them, you must return the 'not_found' error. - **Empty Input**: If the `` is empty or contains no filenames, you must return the 'not_found' error. - - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. + - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. - **If no maintainer file is found**: Return a JSON object in the format `{{"error": "not_found"}}`. @@ -411,15 +419,18 @@ def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[ - {file_names_str} + {candidates_str} Return only the final JSON object. 
""" - async def find_maintainer_file_with_ai(self, file_names): + async def find_maintainer_file_with_ai( + self, candidates: list[tuple[str, int]] + ) -> tuple[str | None, float]: + """Ask AI to select the best maintainer file from scored candidates.""" self.logger.info("Using AI to find maintainer files...") - prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), file_names) + prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), candidates) result = await invoke_bedrock(prompt, pydantic_model=MaintainerFile) if result.output.file_name is not None: @@ -613,6 +624,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: candidate_files = [(path, score) for path, _, score in candidates] # Step 3: Try AI analysis on top candidate + failed_candidate: str | None = None if candidates: filename, content, _ = candidates[0] try: @@ -625,13 +637,33 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: except Exception as e: self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + failed_candidate = filename self.logger.warning("Top candidate failed, trying AI file detection") else: self.logger.warning("No candidate files found via search, trying AI file detection") # Step 4: AI file detection as last resort file_names = await self._list_repo_files(repo_path) - ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + # Pre-filter to governance-scored files to keep the AI prompt within model limits. + # Fall back to a hard-capped slice of the full list if nothing scores. + # Exclude the already-failed top candidate to avoid re-suggesting it. 
+ scored_tuples = [ + (f, self._score_filename(f)) + for f in file_names + if self._score_filename(f) > 0 and f != failed_candidate + ] + ai_input_files: list[tuple[str, int]] = ( + scored_tuples + if scored_tuples + else [ + (f, 0) for f in file_names[: self.MAX_AI_FILE_LIST_SIZE] if f != failed_candidate + ] + ) + self.logger.info( + f"Passing {len(ai_input_files)} files to AI for maintainer file detection " + f"(total repo files: {len(file_names)})" + ) + ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files) ai_suggested_file = ai_file_name total_cost += ai_cost From 684c85e1180bdab3f151236a60646c94238d7239 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 14:34:35 +0000 Subject: [PATCH 11/16] chore: limit candiate_files saved in db to 100 Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 37c45ca291..82036590e3 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -621,7 +621,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: # Step 2: Find top candidate via filename search + scoring candidates = await self.find_candidate_files(repo_path) - candidate_files = [(path, score) for path, _, score in candidates] + candidate_files = [(path, score) for path, _, score in candidates][:100] # Step 3: Try AI analysis on top candidate failed_candidate: str | None = None From d908df5d176541bcab9b14bfdca60473189fdd47 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 17:17:56 +0000 Subject: [PATCH 12/16] chore: add extra filename & stems Signed-off-by: Mouad BANI --- 
.../src/crowdgit/services/maintainer/maintainer_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 82036590e3..a5f4d2f1f9 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -66,6 +66,7 @@ class MaintainerService(BaseService): ".github/maintainers.md", ".github/contributors.md", ".github/codeowners", + "SECURITY-INSIGHTS.md", } # Governance stems (basename without extension, lowercased) for filename search @@ -90,6 +91,7 @@ class MaintainerService(BaseService): "core_team", "code_owners", "emeritus", + "workgroup", } VALID_EXTENSIONS = { From fa2dd367cacb099029914758b518201441c19e11 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Fri, 13 Mar 2026 17:01:54 +0000 Subject: [PATCH 13/16] feat: analyze all root files combined and fallback to top one in subdirectories Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 125 +++++++++++++----- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index a5f4d2f1f9..c3590b4b54 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -456,15 +456,21 @@ async def _list_repo_files(self, repo_path: str) -> list[str]: if line.strip() ] - async def _ripgrep_search(self, repo_path: str) -> list[str]: - """Search for files whose basename matches a governance stem, at any depth.""" + async def _ripgrep_search(self, repo_path: str, max_depth: int | None = None) -> list[str]: 
+ """Search for files whose basename matches a governance stem. + + Args: + max_depth: If set, passed as --max-depth to ripgrep (1 = repo root files only). + """ glob_args = ["--glob", "!.git/"] for stem in self.GOVERNANCE_STEMS: glob_args.extend(["--iglob", f"*{stem}*"]) + depth_args = ["--max-depth", str(max_depth)] if max_depth is not None else [] + try: output = await run_shell_command( - ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path + ["rg", "--files", "--hidden", *depth_args, *glob_args, "."], cwd=repo_path ) except CommandExecutionError: self.logger.info("Ripgrep found no governance files by filename") @@ -503,17 +509,26 @@ def _score_filename(self, candidate_path: str) -> int: return self.PARTIAL_STEM_SCORE return 0 - async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str, int]]: + async def find_candidate_files( + self, repo_path: str + ) -> tuple[list[tuple[str, str, int]], list[tuple[str, str, int]]]: """ - Find governance files by filename, score them, and return all candidates sorted by score. + Find governance files by filename, score them, and return (root_candidates, subdir_candidates). + + Root candidates are files directly in the repo root (max-depth 0). + Subdir candidates are files in subdirectories. + Both lists are sorted by score descending. Scoring: full known-path match (100) > exact stem (50) > partial stem (25) + content keywords (+1 each). 
""" - found_paths = await self._ripgrep_search(repo_path) - if not found_paths: - return [] + root_paths = set(await self._ripgrep_search(repo_path, max_depth=1)) + all_paths = await self._ripgrep_search(repo_path) + if not all_paths: + return [], [] + + root_scored: list[tuple[str, str, int]] = [] + subdir_scored: list[tuple[str, str, int]] = [] - scored = [] - for candidate_path in found_paths: + for candidate_path in all_paths: file_path = os.path.join(repo_path, candidate_path) try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: @@ -526,19 +541,24 @@ async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str, int content_score = sum(1 for kw in self.SCORING_KEYWORDS if kw in content.lower()) total = filename_score + content_score - scored.append((candidate_path, content, total)) + entry = (candidate_path, content, total) + if candidate_path in root_paths: + root_scored.append(entry) + else: + subdir_scored.append(entry) + self.logger.info( f"Candidate: {candidate_path} " f"(filename: {filename_score}, content: {content_score}, total: {total})" ) - scored.sort(key=lambda c: c[2], reverse=True) + root_scored.sort(key=lambda c: c[2], reverse=True) + subdir_scored.sort(key=lambda c: c[2], reverse=True) - if scored: - self.logger.info(f"Top candidate: {scored[0][0]} (from {len(scored)} total)") - else: - self.logger.info("No valid candidates after scoring") - return scored + self.logger.info( + f"Found {len(root_scored)} root candidate(s) and {len(subdir_scored)} subdirectory candidate(s)" + ) + return root_scored, subdir_scored async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: """ @@ -621,14 +641,56 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: return _attach_metadata(result) self.logger.info("Falling back to maintainer file detection") - # Step 2: Find top candidate via filename search + scoring - candidates = await self.find_candidate_files(repo_path) - 
candidate_files = [(path, score) for path, _, score in candidates][:100] + # Step 2: Find candidates via filename search + scoring, split by depth + root_candidates, subdir_candidates = await self.find_candidate_files(repo_path) + all_candidates = root_candidates + subdir_candidates + candidate_files = [(path, score) for path, _, score in all_candidates][:100] + + # Step 3: Try root-level files first (in score order), then top subdirectory file + failed_candidates: set[str] = set() + + if not all_candidates: + self.logger.warning("No candidate files found via search, trying AI file detection") + + combined_info: list = [] + best_file: str | None = None + best_file_count: int = 0 + + for filename, content, _ in root_candidates: + try: + result = await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + file_info = result.maintainer_info or [] + combined_info.extend(file_info) + if len(file_info) > best_file_count: + best_file = filename + best_file_count = len(file_info) + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning( + f"AI analysis failed for root file '{filename}': {e.error_message}" + ) + except Exception as e: + self.logger.warning( + f"Unexpected error analyzing root file '{filename}': {repr(e)}" + ) + failed_candidates.add(filename) + + if combined_info: + return _attach_metadata( + MaintainerResult( + maintainer_file=best_file, + maintainer_info=combined_info, + ) + ) - # Step 3: Try AI analysis on top candidate - failed_candidate: str | None = None - if candidates: - filename, content, _ = candidates[0] + if root_candidates and subdir_candidates: + self.logger.warning("All root candidates failed, trying top subdirectory candidate") + elif root_candidates: + self.logger.warning("All root candidates failed, trying AI file detection") + + if subdir_candidates: + filename, content, _ = subdir_candidates[0] try: result = await self.analyze_and_build_result(filename, content) total_cost += 
result.total_cost @@ -638,27 +700,26 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") except Exception as e: self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") - - failed_candidate = filename - self.logger.warning("Top candidate failed, trying AI file detection") - else: - self.logger.warning("No candidate files found via search, trying AI file detection") + failed_candidates.add(filename) + self.logger.warning("Top subdirectory candidate failed, trying AI file detection") # Step 4: AI file detection as last resort file_names = await self._list_repo_files(repo_path) # Pre-filter to governance-scored files to keep the AI prompt within model limits. # Fall back to a hard-capped slice of the full list if nothing scores. - # Exclude the already-failed top candidate to avoid re-suggesting it. + # Exclude all already-failed candidates to avoid re-suggesting them. scored_tuples = [ (f, self._score_filename(f)) for f in file_names - if self._score_filename(f) > 0 and f != failed_candidate + if self._score_filename(f) > 0 and f not in failed_candidates ] ai_input_files: list[tuple[str, int]] = ( scored_tuples if scored_tuples else [ - (f, 0) for f in file_names[: self.MAX_AI_FILE_LIST_SIZE] if f != failed_candidate + (f, 0) + for f in file_names[: self.MAX_AI_FILE_LIST_SIZE] + if f not in failed_candidates ] ) self.logger.info( From 2cd6f467decb2afb3935e4b29befac730562cf5c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Thu, 26 Mar 2026 11:48:52 +0100 Subject: [PATCH 14/16] fix: improve extensionless file detection & debug logs Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py 
b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index c3590b4b54..79cd216d9d 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -66,7 +66,7 @@ class MaintainerService(BaseService): ".github/maintainers.md", ".github/contributors.md", ".github/codeowners", - "SECURITY-INSIGHTS.md", + "security-insights.md", } # Governance stems (basename without extension, lowercased) for filename search @@ -445,7 +445,8 @@ async def _list_repo_files(self, repo_path: str) -> list[str]: """List non-code files in the repo recursively, filtered by VALID_EXTENSIONS.""" glob_args = ["--glob", "!.git/"] for ext in self.VALID_EXTENSIONS: - glob_args.extend(["--iglob", f"*{ext}"]) + if ext: + glob_args.extend(["--iglob", f"*{ext}"]) output = await run_shell_command( ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path @@ -453,7 +454,7 @@ async def _list_repo_files(self, repo_path: str) -> list[str]: return [ line[2:] if line.startswith("./") else line for line in output.strip().split("\n") - if line.strip() + if line.strip() and os.path.splitext(line)[1] in self.VALID_EXTENSIONS ] async def _ripgrep_search(self, repo_path: str, max_depth: int | None = None) -> list[str]: @@ -475,6 +476,16 @@ async def _ripgrep_search(self, repo_path: str, max_depth: int | None = None) -> except CommandExecutionError: self.logger.info("Ripgrep found no governance files by filename") return [] + except FileNotFoundError as e: + if not os.path.isdir(repo_path): + self.logger.warning( + f"Ripgrep search failed: repo_path does not exist: '{repo_path}'" + ) + else: + self.logger.warning( + f"Ripgrep search failed: 'rg' binary not found in PATH. Install ripgrep. 
({repr(e)})" + ) + return [] except Exception as e: self.logger.warning(f"Ripgrep search failed: {repr(e)}") return [] @@ -488,9 +499,11 @@ async def _ripgrep_search(self, repo_path: str, max_depth: int | None = None) -> line = line[2:] basename = os.path.basename(line).lower() if basename in self.EXCLUDED_FILENAMES: + self.logger.debug(f"Excluding '{line}': basename in EXCLUDED_FILENAMES") continue ext = os.path.splitext(basename)[1] if ext not in self.VALID_EXTENSIONS: + self.logger.debug(f"Excluding '{line}': extension '{ext}' not in VALID_EXTENSIONS") continue results.append(line) @@ -547,9 +560,9 @@ async def find_candidate_files( else: subdir_scored.append(entry) - self.logger.info( + self.logger.debug( f"Candidate: {candidate_path} " - f"(filename: {filename_score}, content: {content_score}, total: {total})" + f"(filename_score={filename_score}, content_score={content_score}, total={total})" ) root_scored.sort(key=lambda c: c[2], reverse=True) @@ -591,6 +604,7 @@ async def try_saved_maintainer_file( """ cost = 0.0 file_path = os.path.join(repo_path, saved_maintainer_file) + self.logger.debug(f"Checking saved maintainer file on disk: '{file_path}'") if not await aiofiles.os.path.isfile(file_path): self.logger.warning( @@ -598,6 +612,7 @@ async def try_saved_maintainer_file( ) return None, cost + self.logger.debug(f"Saved maintainer file exists, reading content: '{saved_maintainer_file}'") try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() @@ -645,6 +660,12 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: root_candidates, subdir_candidates = await self.find_candidate_files(repo_path) all_candidates = root_candidates + subdir_candidates candidate_files = [(path, score) for path, _, score in all_candidates][:100] + self.logger.debug( + f"Detection step 2: {len(root_candidates)} root candidate(s), " + f"{len(subdir_candidates)} subdir candidate(s); " + f"root={[p for p, _, _ in root_candidates]}, " 
+ f"subdir_top={[p for p, _, _ in subdir_candidates[:3]]}" + ) # Step 3: Try root-level files first (in score order), then top subdirectory file failed_candidates: set[str] = set() @@ -656,7 +677,8 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: best_file: str | None = None best_file_count: int = 0 - for filename, content, _ in root_candidates: + for filename, content, score in root_candidates: + self.logger.debug(f"Detection step 3: trying root candidate '{filename}' (score={score})") try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost @@ -690,7 +712,8 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: self.logger.warning("All root candidates failed, trying AI file detection") if subdir_candidates: - filename, content, _ = subdir_candidates[0] + filename, content, score = subdir_candidates[0] + self.logger.debug(f"Detection step 3b: trying top subdir candidate '{filename}' (score={score})") try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost @@ -726,10 +749,12 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: f"Passing {len(ai_input_files)} files to AI for maintainer file detection " f"(total repo files: {len(file_names)})" ) + self.logger.debug(f"AI input files: {[f for f, _ in ai_input_files]}") ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files) ai_suggested_file = ai_file_name total_cost += ai_cost + self.logger.debug(f"AI suggested file: '{ai_file_name}' (cost={ai_cost:.4f})") if ai_file_name: file_path = os.path.join(repo_path, ai_file_name) if not await aiofiles.os.path.isfile(file_path): @@ -826,7 +851,7 @@ async def process_maintainers( ai_cost = 0.0 maintainers_found = 0 maintainers_skipped = 0 - candidate_files: list[str] = [] + candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None try: From 1a6a08d93b94845b80078e820febf08e57176e92 Mon Sep 
17 00:00:00 2001 From: Mouad BANI Date: Thu, 26 Mar 2026 13:25:20 +0100 Subject: [PATCH 15/16] feat: improve README.md handling Signed-off-by: Mouad BANI --- .../src/crowdgit/models/maintainer_info.py | 1 + .../services/maintainer/maintainer_service.py | 29 ++++++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 1752999e54..9a298c7820 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -36,3 +36,4 @@ class MaintainerResult(BaseModel): total_cost: float = 0 candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None + not_found: bool = False diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 79cd216d9d..455500bb3e 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -67,6 +67,7 @@ class MaintainerService(BaseService): ".github/contributors.md", ".github/codeowners", "security-insights.md", + "readme.md", } # Governance stems (basename without extension, lowercased) for filename search @@ -579,9 +580,11 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai Raises MaintanerAnalysisError if no maintainers are found.
""" self.logger.info(f"Analyzing maintainer file: {filename}") - if "readme" in filename.lower() and "maintainer" not in content.lower(): + if "readme" in filename.lower() and not any( + kw in content.lower() for kw in self.SCORING_KEYWORDS + ): self.logger.warning( - f"Skipping README file '{filename}': no 'maintainer' keyword found in content" + f"Skipping README file '{filename}': no governance keyword found in content" ) raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND) result = await self.analyze_file_content(filename, content) @@ -612,7 +615,9 @@ async def try_saved_maintainer_file( ) return None, cost - self.logger.debug(f"Saved maintainer file exists, reading content: '{saved_maintainer_file}'") + self.logger.debug( + f"Saved maintainer file exists, reading content: '{saved_maintainer_file}'" + ) try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() @@ -678,7 +683,9 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: best_file_count: int = 0 for filename, content, score in root_candidates: - self.logger.debug(f"Detection step 3: trying root candidate '{filename}' (score={score})") + self.logger.debug( + f"Detection step 3: trying root candidate '{filename}' (score={score})" + ) try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost @@ -713,7 +720,9 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: if subdir_candidates: filename, content, score = subdir_candidates[0] - self.logger.debug(f"Detection step 3b: trying top subdir candidate '{filename}' (score={score})") + self.logger.debug( + f"Detection step 3b: trying top subdir candidate '{filename}' (score={score})" + ) try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost @@ -754,7 +763,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: ai_suggested_file = ai_file_name total_cost += ai_cost - 
self.logger.debug(f"AI suggested file: '{ai_file_name}' (cost={ai_cost:.4f})") + self.logger.info(f"AI suggested file: '{ai_file_name}' (cost={ai_cost:.4f})") if ai_file_name: file_path = os.path.join(repo_path, ai_file_name) if not await aiofiles.os.path.isfile(file_path): @@ -775,7 +784,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: ) self.logger.error("No maintainer file found") - raise MaintainerFileNotFoundError(ai_cost=total_cost) + return _attach_metadata(MaintainerResult(total_cost=total_cost, not_found=True)) async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -870,10 +879,14 @@ async def process_maintainers( ) latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost - maintainers_found = len(maintainers.maintainer_info) candidate_files = maintainers.candidate_files ai_suggested_file = maintainers.ai_suggested_file + if maintainers.not_found: + raise MaintainerFileNotFoundError(ai_cost=ai_cost) + + maintainers_found = len(maintainers.maintainer_info) + if repository.parent_repo: filtered_maintainers = await self.exclude_parent_repo_maintainers( repository.parent_repo, maintainers.maintainer_info From bbacdfa31d1156c862876850f2d93a44fd0ceecc Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Thu, 26 Mar 2026 13:49:19 +0100 Subject: [PATCH 16/16] fix: undo extensionless files optimization Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 455500bb3e..e859d085fa 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -446,8 +446,7 @@ async def 
_list_repo_files(self, repo_path: str) -> list[str]: """List non-code files in the repo recursively, filtered by VALID_EXTENSIONS.""" glob_args = ["--glob", "!.git/"] for ext in self.VALID_EXTENSIONS: - if ext: - glob_args.extend(["--iglob", f"*{ext}"]) + glob_args.extend(["--iglob", f"*{ext}"]) output = await run_shell_command( ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path @@ -455,7 +454,7 @@ async def _list_repo_files(self, repo_path: str) -> list[str]: return [ line[2:] if line.startswith("./") else line for line in output.strip().split("\n") - if line.strip() and os.path.splitext(line)[1] in self.VALID_EXTENSIONS + if line.strip() ] async def _ripgrep_search(self, repo_path: str, max_depth: int | None = None) -> list[str]: