diff --git a/.tekton/on-pull-request.yaml b/.tekton/on-pull-request.yaml index 30c99ba4d..426d3312e 100644 --- a/.tekton/on-pull-request.yaml +++ b/.tekton/on-pull-request.yaml @@ -251,6 +251,7 @@ spec: # This is handled in the Makefile's lint-pr target and should be reverted after migration. make lint-pr TARGET_BRANCH=$TARGET_BRANCH_NAME + print_banner "RUNNING UNIT TESTS" make test-unit PYTEST_OPTS="--log-cli-level=DEBUG" diff --git a/Dockerfile b/Dockerfile index fa6e6d652..45f492fc9 100755 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,13 @@ ENV PYTHONDONTWRITEBYTECODE=1 ENV AGENT_GIT_COMMIT=${AGENT_GIT_COMMIT} ENV AGENT_GIT_TAG=${AGENT_GIT_TAG} +# System dependencies: +# - build-essential: Required for compiling Python packages with native C extensions +# (e.g., psutil, cryptography, cffi) during `uv sync`. Also needed for libkrb5-dev. +# - libarchive-tools: Provides bsdtar for extracting RPM/SRPM archives in the checker +# - libkrb5-dev: Kerberos headers for gssapi Python package (Brew authentication) RUN apt-get update && apt-get install -y \ + build-essential \ ca-certificates \ curl \ git \ @@ -42,6 +48,7 @@ RUN apt-get update && apt-get install -y \ libarchive-tools \ xz-utils \ libatomic1 \ + libkrb5-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && update-ca-certificates diff --git a/README.md b/README.md index 659a6a456..f4fab8a9f 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,7 @@ The current default embedding NIM model is `nv-embedqa-e5-v5`, which was selecte * [git](https://git-scm.com/) * [git-lfs](https://git-lfs.com/) +* bsdtar (from libarchive) - Used for extracting RPM source archives. Install via `apt install libarchive-tools` (Debian/Ubuntu) or `dnf install bsdtar` (Fedora/RHEL). * Since the workflow uses [NVIDIA NeMo Agent Toolkit](https://docs.nvidia.com/aiqtoolkit), the [NeMo Agent toolkit requirements](https://docs.nvidia.com/aiqtoolkit/latest/quick-start/installing.html#prerequisites) also need to be installed. 
### Obtain API keys diff --git a/kustomize/base/exploit-iq-config.yml b/kustomize/base/exploit-iq-config.yml index b9eab81d5..abfa7d15b 100644 --- a/kustomize/base/exploit-iq-config.yml +++ b/kustomize/base/exploit-iq-config.yml @@ -84,6 +84,11 @@ functions: Code Keyword Search: _type: lexical_code_search top_k: 5 + Source Grep: + _type: source_grep + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + max_results: 50 + context_lines: 2 CVE Web Search: _type: serp_wrapper max_retries: 5 @@ -156,6 +161,38 @@ functions: generate_intel_score: true intel_low_score: 51 insist_analysis: false + cve_source_acquisition: + _type: cve_source_acquisition + base_git_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}git + base_pickle_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}pickle + base_rpm_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}rpms + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + rpm_user_type: ${RPM_USER_TYPE:-internal} + cve_checker_segmentation: + _type: cve_checker_segmentation + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + base_code_index_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}code_index + cve_package_code_agent: + _type: cve_package_code_agent + llm_name: cve_agent_executor_llm + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + base_code_index_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}code_index + rpm_user_type: ${RPM_USER_TYPE:-internal} + tool_names: + - Source Grep + - Code Keyword Search + cve_checker_report: + _type: cve_checker_report + llm_name: cve_agent_executor_llm + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + cve_build_agent: + _type: cve_build_agent + llm_name: cve_agent_executor_llm + base_checker_dir: ${EXPLOIT_IQ_DATA_DIR:-/exploit-iq-data/}checker + max_iterations: 10 + tool_names: + - Source Grep + - Code Keyword Search health_check: _type: health_check @@ -248,6 +285,11 @@ workflow: cve_summarize_name: cve_summarize 
cve_justify_name: cve_justify cve_output_config_name: cve_http_output + cve_source_acquisition_name: cve_source_acquisition + cve_checker_segmentation_name: cve_checker_segmentation + cve_package_code_agent_name: cve_package_code_agent + cve_checker_report_name: cve_checker_report + cve_build_agent_name: cve_build_agent eval: general: diff --git a/kustomize/base/exploit_iq_service.yaml b/kustomize/base/exploit_iq_service.yaml index 655005ab6..b3ab3677a 100644 --- a/kustomize/base/exploit_iq_service.yaml +++ b/kustomize/base/exploit_iq_service.yaml @@ -122,6 +122,8 @@ spec: value: "True" - name: EXPLOIT_IQ_DATA_DIR value: /exploit-iq-data/ + - name: RPM_USER_TYPE + value: "internal" - name: NAMESPACE valueFrom: fieldRef: diff --git a/pyproject.toml b/pyproject.toml index 0d4d36bd5..c3041e8be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,8 @@ dependencies = [ "litellm<=1.75.8", "csaf-tool==0.3.2", "jsonschema>=4.0.0,<5.0.0", + "koji", + "unidiff>=0.7.5", ] requires-python = ">=3.11,<3.13" description = "NVIDIA AI Blueprint: Vulnerability Analysis for Container Security" diff --git a/src/exploit_iq_commons/data/hardening_kb/__init__.py b/src/exploit_iq_commons/data/hardening_kb/__init__.py new file mode 100644 index 000000000..cf7c586a5 --- /dev/null +++ b/src/exploit_iq_commons/data/hardening_kb/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/exploit_iq_commons/data/hardening_kb/hardening_kb.json b/src/exploit_iq_commons/data/hardening_kb/hardening_kb.json new file mode 100644 index 000000000..92ab9a8f9 --- /dev/null +++ b/src/exploit_iq_commons/data/hardening_kb/hardening_kb.json @@ -0,0 +1,395 @@ +{ + "kb_version": "1.1", + "last_updated": "2026-05-09", + "flag_type_definitions": { + "warning": "Compile-time warnings only. Does not add runtime protection. Use for 'Best Practices' audits, NOT for mitigation claims.", + "runtime": "Actual runtime protection or detection. Valid for 'Mitigated' status if maps to the specific CWE.", + "optimization": "Changes compiler optimization behavior but does not add runtime detection/prevention. Not valid for mitigation.", + "linker": "Linker-level hardening that affects runtime binary layout/behavior. Valid for mitigation if maps to CWE.", + "architecture": "Platform-specific build flag. Valid for mitigation ONLY if CVE advisory states the architecture is not affected." 
+ }, + "mappings": [ + { + "flag": "-Wall -Wextra", + "flag_type": "warning", + "description": "Enable warnings for constructs often associated with defects.", + "vulnerability_category": "Defensive Coding", + "cwe_ids": [ + "CWE-563", + "CWE-457", + "CWE-480" + ], + "requires": {} + }, + { + "flag": "-Wformat -Wformat=2", + "flag_type": "warning", + "description": "Enable additional format function warnings.", + "vulnerability_category": "Input Validation", + "cwe_ids": [ + "CWE-134" + ], + "requires": {} + }, + { + "flag": "-Wconversion -Wsign-conversion", + "flag_type": "warning", + "description": "Enable implicit conversion warnings.", + "vulnerability_category": "Arithmetic Safety", + "cwe_ids": [ + "CWE-190", + "CWE-681" + ], + "requires": {} + }, + { + "flag": "-Wtrampolines", + "flag_type": "warning", + "description": "Enable warnings about trampolines that require executable stacks.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693" + ], + "requires": {} + }, + { + "flag": "-Wimplicit-fallthrough", + "flag_type": "warning", + "description": "Warn when a switch case falls through.", + "vulnerability_category": "Defensive Coding", + "cwe_ids": [ + "CWE-484" + ], + "requires": {} + }, + { + "flag": "-Wbidi-chars=any", + "flag_type": "warning", + "description": "Enable warnings for possibly misleading Unicode bidirectional control characters.", + "vulnerability_category": "Code Integrity", + "cwe_ids": [ + "CWE-1301" + ], + "requires": {} + }, + { + "flag": "-Werror", + "flag_type": "warning", + "description": "Treat all or selected compiler warnings as errors.", + "vulnerability_category": "Policy Enforcement", + "cwe_ids": [ + "N/A" + ], + "requires": {} + }, + { + "flag": "-Werror=format-security", + "flag_type": "warning", + "description": "Treat format strings that are not string literals and used without arguments as errors.", + "vulnerability_category": "Input Validation", + "cwe_ids": [ + "CWE-134" + ], + "requires": {} 
+ }, + { + "flag": "-Werror=implicit -Werror=incompatible-pointer-types -Werror=int-conversion", + "flag_type": "warning", + "description": "Treat obsolete C constructs as errors.", + "vulnerability_category": "Type Safety", + "cwe_ids": [ + "CWE-704", + "CWE-843" + ], + "requires": {} + }, + { + "flag": "-D_FORTIFY_SOURCE=3", + "flag_type": "runtime", + "description": "Fortify sources with compile- and run-time checks for unsafe libc usage and buffer overflows.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-119", + "CWE-120", + "CWE-121", + "CWE-122" + ], + "requires": {} + }, + { + "flag": "-D_FORTIFY_SOURCE=2", + "flag_type": "runtime", + "description": "Fortify sources with compile- and run-time checks for unsafe libc usage and buffer overflows (legacy level).", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-119", + "CWE-120", + "CWE-121", + "CWE-122" + ], + "requires": {} + }, + { + "flag": "-D_GLIBCXX_ASSERTIONS", + "flag_type": "runtime", + "description": "Precondition checks for C++ standard library calls.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-119", + "CWE-125", + "CWE-787" + ], + "requires": {} + }, + { + "flag": "-fstrict-flex-arrays=3", + "flag_type": "runtime", + "description": "Consider a trailing array in a struct as a flexible array if declared as [].", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-119", + "CWE-125", + "CWE-787" + ], + "requires": {} + }, + { + "flag": "-fstack-clash-protection", + "flag_type": "runtime", + "description": "Enable run-time checks for variable-size stack allocation validity.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-785" + ], + "requires": {} + }, + { + "flag": "-fstack-protector-strong", + "flag_type": "runtime", + "description": "Enable run-time checks for stack-based buffer overflows.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-121" + ], + "requires": {} + }, + { + "flag": 
"-fcf-protection=full", + "flag_type": "runtime", + "description": "Enable control-flow protection against return-oriented programming (ROP) and jump-oriented programming (JOP) attacks on x86_64.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693" + ], + "requires": {} + }, + { + "flag": "-mbranch-protection=standard", + "flag_type": "runtime", + "description": "Enable branch protection against ROP and JOP attacks on AArch64.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693" + ], + "requires": {} + }, + { + "flag": "-ftrapv", + "flag_type": "runtime", + "description": "Generate traps for signed arithmetic overflow on addition, subtraction, multiplication.", + "vulnerability_category": "Arithmetic Safety", + "cwe_ids": [ + "CWE-190", + "CWE-191" + ], + "requires": {} + }, + { + "flag": "-fsanitize=signed-integer-overflow", + "flag_type": "runtime", + "description": "Enable undefined behavior sanitizer for signed integer overflow detection.", + "vulnerability_category": "Arithmetic Safety", + "cwe_ids": [ + "CWE-190", + "CWE-191" + ], + "requires": {} + }, + { + "flag": "-fsanitize=unsigned-integer-overflow", + "flag_type": "runtime", + "description": "Enable undefined behavior sanitizer for unsigned integer overflow detection.", + "vulnerability_category": "Arithmetic Safety", + "cwe_ids": [ + "CWE-190", + "CWE-191" + ], + "requires": {} + }, + { + "flag": "-Wl,-z,nodlopen", + "flag_type": "linker", + "description": "Restrict dlopen(3) calls to shared objects.", + "vulnerability_category": "Policy Enforcement", + "cwe_ids": [ + "CWE-269" + ], + "requires": {} + }, + { + "flag": "-Wl,-z,noexecstack", + "flag_type": "linker", + "description": "Enable data execution prevention by marking stack memory as non-executable.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693", + "CWE-94" + ], + "requires": {} + }, + { + "flag": "-Wl,-z,relro -Wl,-z,now", + "flag_type": "linker", + 
"description": "Mark relocation table entries resolved at load-time as read-only.", + "vulnerability_category": "Code Integrity", + "cwe_ids": [ + "CWE-123" + ], + "requires": {} + }, + { + "flag": "-fPIE -pie", + "flag_type": "linker", + "description": "Build as position-independent executable.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693" + ], + "requires": {} + }, + { + "flag": "-fPIC -shared", + "flag_type": "linker", + "description": "Build as position-independent code.", + "vulnerability_category": "Control Flow Integrity", + "cwe_ids": [ + "CWE-693" + ], + "requires": {} + }, + { + "flag": "-fno-delete-null-pointer-checks", + "flag_type": "optimization", + "description": "Force retention of null pointer checks.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-476" + ], + "requires": {} + }, + { + "flag": "-fno-strict-overflow", + "flag_type": "optimization", + "description": "Do not assume signed integer overflow is undefined behavior. 
Prevents aggressive optimizations but does NOT add runtime detection.", + "vulnerability_category": "Arithmetic Safety", + "cwe_ids": [ + "CWE-190" + ], + "requires": {} + }, + { + "flag": "-fno-strict-aliasing", + "flag_type": "optimization", + "description": "Do not assume strict aliasing.", + "vulnerability_category": "Memory Safety", + "cwe_ids": [ + "CWE-416" + ], + "requires": {} + }, + { + "flag": "-ftrivial-auto-var-init", + "flag_type": "runtime", + "description": "Initialize automatic variables that lack explicit initializers.", + "vulnerability_category": "Information Leakage", + "cwe_ids": [ + "CWE-457" + ], + "requires": {} + }, + { + "flag": "-fexceptions", + "flag_type": "runtime", + "description": "Enable exception propagation to harden multi-threaded C code.", + "vulnerability_category": "Error Handling", + "cwe_ids": [ + "CWE-391" + ], + "requires": {} + }, + { + "flag": "-fhardened", + "flag_type": "runtime", + "description": "Enable pre-determined set of hardening options in GCC.", + "vulnerability_category": "Full Hardening", + "cwe_ids": [ + "Multi" + ], + "requires": {} + }, + { + "flag": "-Wl,--as-needed -Wl,--no-copy-dt-needed-entries", + "flag_type": "linker", + "description": "Allow linker to omit libraries specified on the command line to link against if they are not used.", + "vulnerability_category": "Supply Chain Safety", + "cwe_ids": [ + "N/A" + ], + "requires": {} + }, + { + "flag": "-fzero-init-padding-bits=all", + "flag_type": "runtime", + "description": "Guarantee zero initialization of padding bits in all automatic variable initializers.", + "vulnerability_category": "Information Leakage", + "cwe_ids": [ + "CWE-200" + ], + "requires": {} + }, + { + "flag": "-m64", + "flag_type": "architecture", + "description": "Compile for 64-bit x86_64 architecture. 
Many integer overflow vulnerabilities only affect 32-bit systems.", + "vulnerability_category": "Architecture", + "cwe_ids": [ + "CWE-190", + "CWE-680", + "CWE-681" + ], + "requires": { + "advisory_states": "Mitigation valid ONLY if CVE advisory explicitly states 64-bit systems are not affected." + } + }, + { + "flag": "-m32", + "flag_type": "architecture", + "description": "Compile for 32-bit i686 architecture.", + "vulnerability_category": "Architecture", + "cwe_ids": [], + "requires": { + "advisory_states": "Check CVE advisory for 32-bit specific vulnerabilities." + } + }, + { + "flag": "-march=", + "flag_type": "architecture", + "description": "Target specific CPU architecture. May affect vulnerability applicability.", + "vulnerability_category": "Architecture", + "cwe_ids": [], + "requires": { + "advisory_states": "Check CVE advisory for architecture-specific conditions." + } + } + ] +} diff --git a/src/exploit_iq_commons/data_models/checker_status.py b/src/exploit_iq_commons/data_models/checker_status.py new file mode 100644 index 000000000..029cf8245 --- /dev/null +++ b/src/exploit_iq_commons/data_models/checker_status.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from enum import Enum +from enum import IntEnum +from pathlib import Path +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +class PackageCheckerStatus(IntEnum): + """Per-CVE status codes produced by the PackageIdentify phase.""" + OK = 0 + ERROR_PKG_IDENT_NO_INTEL = 1 + PKG_IDENT_NOT_VUL = 2 + ERROR_FAILED_TO_DOWNLOAD_SRPM = 3 + PKG_IDENT_CVE_MISMATCH = 4 + PKG_INTEL_LOW_SCORE = 5 + + +PACKAGE_CHECKER_STATUS_DESCRIPTIONS: dict[PackageCheckerStatus, str] = { + PackageCheckerStatus.OK: + "Package identified and in affected range -- continue investigation", + PackageCheckerStatus.ERROR_PKG_IDENT_NO_INTEL: + "No Intel found for the package", + PackageCheckerStatus.PKG_IDENT_NOT_VUL: + "Identification state concluded from intel that target package is not vulnerable", + PackageCheckerStatus.ERROR_FAILED_TO_DOWNLOAD_SRPM: + "Failed to download the patched SRPM", + PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH: + "CVE does not apply to target package - RHSA does not list this package", + PackageCheckerStatus.PKG_INTEL_LOW_SCORE: + "Intel quality score below threshold - insufficient information for reliable analysis", +} + +CHECKER_FAILURE_ERROR_TYPES: dict[PackageCheckerStatus, str] = { + PackageCheckerStatus.ERROR_PKG_IDENT_NO_INTEL: "no-intel", + PackageCheckerStatus.ERROR_FAILED_TO_DOWNLOAD_SRPM: "srpm-download-failed", + PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH: "invalid-input", +} + + +class EnumIdentifyResult(str, Enum): + """Result of the PackageIdentify phase for a single CVE.""" + YES = "yes" + NO = "no" + UNKNOWN = "unknown" + +class PackageIdentifyResult(BaseModel): + """Result of the PackageIdentify phase for a single CVE.""" + affected_rpm_list: list[str] = [] + fixed_rpm_list: list[str] = [] + + is_target_package_affected: EnumIdentifyResult = EnumIdentifyResult.UNKNOWN + is_target_package_fixed: EnumIdentifyResult = EnumIdentifyResult.UNKNOWN + + conclusion_reason: str = Field( + default="", + description="Detailed explanation 
of why the package was determined to be vulnerable or not vulnerable" + ) + + + +class AcquiredArtifacts(BaseModel): + """Resolved file locations populated by source_acquisition, consumed by downstream checker nodes.""" + srpm_path: Path | None = None + source_dir: Path | None = None + build_log_path: Path | None = None + binary_rpm_path: Path | None = None + patch_source_dir: Path | None = None + patch_diff_path: Path | None = None + source_url: str | None = None + + +class VulnerabilityIntel(BaseModel): + """Structured intelligence extracted from CVE advisories and patches. + + Used to provide grep-ready patterns and context for L1 agent source searches. + """ + + affected_files: list[str] = Field( + default_factory=list, + description="Source file paths likely to contain vulnerable code" + ) + vulnerable_functions: list[str] = Field( + default_factory=list, + description="Function names that contain or handle the vulnerability" + ) + vulnerable_variables: list[str] = Field( + default_factory=list, + description="Variable names involved in the vulnerability" + ) + vulnerable_patterns: list[str] = Field( + default_factory=list, + description="Code patterns/snippets indicating vulnerable code (from - lines)" + ) + fix_patterns: list[str] = Field( + default_factory=list, + description="Code patterns/snippets indicating fixed code (from + lines)" + ) + root_cause: str = Field( + default="", + description="Technical explanation of why the code is vulnerable" + ) + vulnerability_type: str = Field( + default="", + description="Category: buffer_overflow, integer_overflow, use_after_free, null_deref, etc." 
+ ) + search_keywords: list[str] = Field( + default_factory=list, + description="Recommended grep patterns ordered by specificity (most specific first)" + ) + affected_bitness: Literal["32-bit", "64-bit", "both"] = Field( + default="both", + description="Which bitness is affected: 32-bit only, 64-bit only, or both (default)" + ) + affected_architectures: list[str] | None = Field( + default=None, + description="CPU families affected (e.g., ['x86', 'arm']). None means all architectures." + ) + is_downstream_patch_available: bool = Field( + default=False, + description="True if a CVE-specific patch file exists in the downstream package" + ) + is_patch_applied_in_build: bool = Field( + default=False, + description="True if the patch was confirmed applied in build logs" + ) + patch_file_name: str = Field( + default="", + description="Name of the CVE-specific patch file (if available)" + ) + known_mitigations: str = Field( + default="", + description="Vendor-provided mitigations from RHSA or other intel sources (e.g., compiler flags, config changes)" + ) + + def format_for_prompt(self) -> str: + """Format VulnerabilityIntel for injection into L1 agent runtime prompt. + + Uses UPPERCASE labels so they can be referenced as anchors in thought prompts. 
+ """ + lines = [] + if self.is_downstream_patch_available: + status = "APPLIED" if self.is_patch_applied_in_build else "AVAILABLE" + lines.append(f"DOWNSTREAM_PATCH_STATUS: {status}") + if self.patch_file_name: + lines.append(f"PATCH_FILE: {self.patch_file_name}") + if self.affected_files: + lines.append(f"AFFECTED_FILES: {', '.join(self.affected_files)}") + if self.vulnerable_functions: + lines.append(f"VULNERABLE_FUNCTIONS: {', '.join(self.vulnerable_functions)}") + if self.vulnerable_variables: + lines.append(f"VULNERABLE_VARIABLES: {', '.join(self.vulnerable_variables)}") + if self.vulnerable_patterns: + lines.append("VULNERABLE_PATTERNS:") + for p in self.vulnerable_patterns: + lines.append(f" - {p}") + if self.fix_patterns: + lines.append("FIX_PATTERNS:") + for p in self.fix_patterns: + lines.append(f" - {p}") + if self.search_keywords: + lines.append(f"SEARCH_KEYWORDS: {', '.join(self.search_keywords)}") + if self.root_cause: + lines.append(f"ROOT_CAUSE: {self.root_cause}") + if self.affected_bitness and self.affected_bitness != "both": + lines.append(f"AFFECTED_BITNESS: {self.affected_bitness}") + if self.affected_architectures: + lines.append(f"AFFECTED_ARCHITECTURES: {', '.join(self.affected_architectures)}") + if self.known_mitigations: + lines.append(f"KNOWN_MITIGATIONS: {self.known_mitigations}") + return "\n".join(lines) + + +class L1InvestigationResult(BaseModel): + """Intermediate result from L1 investigation, input to L2 or report generation.""" + downstream_report: dict[str, Any] | None = Field( + default=None, + description="Serialized DownstreamSearchReport from L1 investigation", + ) + upstream_report: dict[str, Any] | None = Field( + default=None, + description="Serialized UpstreamSearchReport from L1 investigation", + ) + l1_agent_answer: str | None = Field( + default=None, + description="Final answer from the L1 ReAct agent", + ) + vulnerability_intel: VulnerabilityIntel | None = Field( + default=None, + description="Structured 
vulnerability intelligence extracted from CVE advisories and patches", + ) + preliminary_verdict: Literal["vulnerable", "protected", "not_present", "uncertain"] = Field( + default="uncertain", + description="L1 verdict before L2 refinement", + ) + confidence: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description="Confidence in the preliminary verdict", + ) + + +class L2BuildResult(BaseModel): + """Result from L2 Build Agent (BuildCompilationCheck + HardeningCheck).""" + compilation_status: Literal["compiled", "not_compiled", "unknown"] = Field( + default="unknown", + description="Whether vulnerable code is compiled into the binary", + ) + compilation_confidence: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description="Confidence in compilation status", + ) + compilation_evidence: str | None = Field( + default=None, + description="Evidence supporting compilation status", + ) + hardening_relevant: bool | None = Field( + default=None, + description="Whether detected hardening flags are relevant to the CVE", + ) + hardening_flags: list[str] = Field( + default_factory=list, + description="Hardening flags detected in build log or binary", + ) + hardening_rationale: str | None = Field( + default=None, + description="Rationale for hardening relevance judgment", + ) + l2_override_verdict: Literal["not_vulnerable", "vulnerable_mitigated", None] = Field( + default=None, + description="L2 verdict override (if any)", + ) + evidence_sources: list[str] = Field( + default_factory=list, + description="Sources used for analysis: 'build_log', 'spec_file', 'build_system_files', 'binary'", + ) + + +class PackageCheckerContext(BaseModel): + """Consolidates all checker-specific state on AgentMorpheusInfo.""" + status: PackageCheckerStatus | None = None + source_key: str | None = None + artifacts: AcquiredArtifacts = Field(default_factory=AcquiredArtifacts) + identify_result: PackageIdentifyResult = Field(default_factory=PackageIdentifyResult) + l1_result: 
L1InvestigationResult | None = Field( + default=None, + description="Result from L1 Code Agent investigation", + ) + l2_result: L2BuildResult | None = Field( + default=None, + description="Result from L2 Build Agent (optional)", + ) diff --git a/src/exploit_iq_commons/data_models/common.py b/src/exploit_iq_commons/data_models/common.py index 077a98fa9..db848f4ad 100644 --- a/src/exploit_iq_commons/data_models/common.py +++ b/src/exploit_iq_commons/data_models/common.py @@ -28,6 +28,17 @@ class AnalysisType(str, Enum): IMAGE = "image" SOURCE = "source" + +class PipelineMode(str, Enum): + """ + Controls which investigation path the pipeline takes after process_sbom. + Orthogonal to AnalysisType (input format) -- any combination is valid. + """ + FULL_PIPELINE = "full_pipeline" + PACKAGE_CHECKER = "rpm_package_checker" + + + class HashableModel(BaseModel): """ Subclass of a Pydantic BaseModel that is hashable. Use in objects that need to be hashed for caching purposes. @@ -50,7 +61,15 @@ def __ne__(self, other): def __gt__(self, other): return self.__hash__() > other.__hash__() - +class TargetPackage(HashableModel): + """ + A package to investigate. + """ + name: str + version: str | None = None + release: str | None = None # e.g. "1.el8_2.3" (needed for Brew NVR lookup) + arch: str = "x86_64" # e.g. "x86_64", "aarch64", "s390x", "noarch" + class TypedBaseModel(BaseModel, typing.Generic[_LT]): """ Subclass of Pydantic BaseModel that allows for specifying the object type. Use in Pydantic discriminated unions. 
diff --git a/src/exploit_iq_commons/data_models/cve_intel.py b/src/exploit_iq_commons/data_models/cve_intel.py index 8050ffe26..33f92aecb 100644 --- a/src/exploit_iq_commons/data_models/cve_intel.py +++ b/src/exploit_iq_commons/data_models/cve_intel.py @@ -110,6 +110,7 @@ class Configuration(BaseModel): cvss_vector: str | None = None cvss_base_score: float | None = None cvss_severity: str | None = None + cwe_id: str | None = None cwe_name: str | None = None cwe_description: str | None = None cwe_extended_description: str | None = None @@ -185,8 +186,8 @@ class CVSSV3(BaseModel): class BaseMetricV3(BaseModel): cvssV3: "CVSSV3" - exploitabilityScore: float - impactScore: float + exploitabilityScore: float | None = None + impactScore: float | None = None class Impact(BaseModel): baseMetricV3: "BaseMetricV3" @@ -197,6 +198,10 @@ class Impact(BaseModel): priority: str | None = None ubuntu_description: str | None = None impact: Impact | None = None + patches: dict[str, list[str]] | None = Field( + default=None, + description="Map of package name to patch refs (e.g., 'upstream: https://github.com/.../commit/...')" + ) @property def description_fields(self): diff --git a/src/exploit_iq_commons/data_models/info.py b/src/exploit_iq_commons/data_models/info.py index a01f1dda7..4f7bd1ef1 100644 --- a/src/exploit_iq_commons/data_models/info.py +++ b/src/exploit_iq_commons/data_models/info.py @@ -15,6 +15,7 @@ from pydantic import BaseModel +from exploit_iq_commons.data_models.checker_status import PackageCheckerContext from exploit_iq_commons.data_models.cve_intel import CveIntel from exploit_iq_commons.data_models.dependencies import VulnerableDependencies @@ -62,3 +63,4 @@ class SBOMInfo(BaseModel): intel: list[CveIntel] | None = None sbom: SBOMInfo | None = None vulnerable_dependencies: list[VulnerableDependencies] | None = None + checker_context: PackageCheckerContext | None = None diff --git a/src/exploit_iq_commons/data_models/input.py 
b/src/exploit_iq_commons/data_models/input.py index 897c915d1..77a325214 100644 --- a/src/exploit_iq_commons/data_models/input.py +++ b/src/exploit_iq_commons/data_models/input.py @@ -25,12 +25,14 @@ from pydantic import Field from pydantic import Tag from pydantic import field_validator +from pydantic import model_validator from exploit_iq_commons.utils.string_utils import is_valid_cve_id from exploit_iq_commons.utils.string_utils import is_valid_ghsa_id from exploit_iq_commons.utils.dep_tree import Ecosystem from exploit_iq_commons.data_models.common import AnalysisType from exploit_iq_commons.data_models.common import HashableModel +from exploit_iq_commons.data_models.common import PipelineMode , TargetPackage from exploit_iq_commons.data_models.common import TypedBaseModel from exploit_iq_commons.data_models.info import AgentMorpheusInfo from exploit_iq_commons.data_models.info import SBOMPackage @@ -168,9 +170,25 @@ class ImageInfoInput(HashableModel): - "source": Analysis of source code and commitId without SBOM data """ - source_info: list[SourceDocumentsInfo] + pipeline_mode: PipelineMode = PipelineMode.FULL_PIPELINE + """ + Controls which investigation path the pipeline takes after process_sbom: + - "full_pipeline": Full transitive analysis (check_vuln_deps -> llm_engine) + - "rpm_package_checker": Focused package vulnerability checker (package_checker -> checker_output) + """ + target_package: TargetPackage | None = None + + source_info: list[SourceDocumentsInfo] = [] sbom_info: SBOMInfoInput | None = None + @model_validator(mode="after") + def validate_pipeline_requirements(self) -> "ImageInfoInput": + if self.pipeline_mode == PipelineMode.PACKAGE_CHECKER and self.target_package is None: + raise ValueError("target_package is required when pipeline_mode is PACKAGE_CHECKER") + if self.pipeline_mode == PipelineMode.FULL_PIPELINE and not self.source_info: + raise ValueError("source_info is required and must not be empty when pipeline_mode is FULL_PIPELINE") + 
return self + @field_validator('source_info', mode='after') @classmethod def check_conflicting_refs(cls, source_info: list[SourceDocumentsInfo]) -> list[SourceDocumentsInfo]: diff --git a/src/exploit_iq_commons/utils/functions_parsers/python_functions_parser.py b/src/exploit_iq_commons/utils/functions_parsers/python_functions_parser.py index fe7eeec7d..8aa46f681 100644 --- a/src/exploit_iq_commons/utils/functions_parsers/python_functions_parser.py +++ b/src/exploit_iq_commons/utils/functions_parsers/python_functions_parser.py @@ -164,8 +164,12 @@ def search_for_called_function(self, caller_function: Document, callee_function_ if len(parts) == 1: identifier = parts[0] identifier = identifier.rstrip('(') - callee_function = code_documents[callee_function_file_name] - callee_function_package = self.get_package_names(callee_function)[0] + try: + callee_function_doc = code_documents[callee_function_file_name] + callee_function_package = self.get_package_names(callee_function_doc)[0] + except KeyError: + # Third-party package without source files - use file name as package name + callee_function_package = callee_function_file_name caller_function_package = self.get_package_names(caller_function)[0] if callee_function_package == caller_function_package: diff --git a/src/exploit_iq_commons/utils/hardening_kb.py b/src/exploit_iq_commons/utils/hardening_kb.py new file mode 100644 index 000000000..22fe6c789 --- /dev/null +++ b/src/exploit_iq_commons/utils/hardening_kb.py @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Hardening Knowledge Base utilities. + +Loads the hardening_kb.json file containing compiler/linker flags that mitigate +specific CWE vulnerability categories. Provides lookup by CWE ID to retrieve +relevant hardening flags and their descriptions for LLM context. +""" + +from __future__ import annotations + +import json +import threading +from pathlib import Path + +from pydantic import BaseModel, Field + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory + +logger = LoggingFactory.get_agent_logger(__name__) + + +class HardeningEntry(BaseModel): + """A single hardening flag entry from the knowledge base.""" + + flag: str = Field(description="Compiler/linker flag(s) for hardening") + flag_type: str = Field(description="Type: runtime, linker, warning, optimization, architecture") + description: str = Field(description="Description of what the flag does") + vulnerability_category: str = Field(description="Category of vulnerability this mitigates") + cwe_ids: list[str] = Field(default_factory=list, description="CWE IDs this flag helps mitigate") + + +# Flag types that provide actual runtime mitigation (not just warnings or optimization changes) +MITIGATING_FLAG_TYPES = frozenset({"runtime", "linker"}) + + +class HardeningKB: + """In-memory cache for hardening flags knowledge base. + + Implements singleton pattern to ensure single instance across the application. + Provides lookup by CWE ID to find relevant hardening flags. 
+ """ + + _instance = None + _lock = threading.Lock() + + def __new__(cls): + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self) -> None: + if not hasattr(self, '_initialized'): + base_path = Path(__file__).resolve().parents[1] + self.json_path = base_path / "data" / "hardening_kb" / "hardening_kb.json" + + self._entries: list[HardeningEntry] = [] + self._cwe_index: dict[str, list[HardeningEntry]] = {} + self._initialized = True + self._load() + + @classmethod + def get_instance(cls) -> "HardeningKB": + """Get the singleton instance of HardeningKB.""" + return cls() + + def _load(self) -> None: + """Load the hardening KB JSON and build the CWE index.""" + try: + data = json.loads(self.json_path.read_text(encoding="utf-8")) + except FileNotFoundError: + logger.warning("Hardening KB JSON not found at %s", self.json_path) + return + except json.JSONDecodeError as exc: + logger.error("Failed to parse hardening KB JSON: %s", exc) + return + + mappings = data.get("mappings", []) + for mapping in mappings: + try: + entry = HardeningEntry( + flag=mapping.get("flag", "").strip(), + flag_type=mapping.get("flag_type", "unknown"), + description=mapping.get("description", ""), + vulnerability_category=mapping.get("vulnerability_category", ""), + cwe_ids=mapping.get("cwe_ids", []), + ) + self._entries.append(entry) + + for cwe_id in entry.cwe_ids: + normalized = self._normalize_cwe_id(cwe_id) + if normalized: + if normalized not in self._cwe_index: + self._cwe_index[normalized] = [] + self._cwe_index[normalized].append(entry) + + except Exception as exc: + logger.warning("Failed to parse hardening entry: %s - %s", mapping, exc) + + logger.info( + "Loaded hardening KB: %d entries, %d unique CWE mappings", + len(self._entries), + len(self._cwe_index), + ) + + @staticmethod + def _normalize_cwe_id(cwe_id: str) -> str | None: + """Normalize CWE ID to uppercase format (e.g., 
'CWE-121'). + + Returns None for empty or whitespace-only input and for special values like 'N/A' or 'Multi'. + """ + if not cwe_id: + return None + cwe_id = cwe_id.strip().upper() + if not cwe_id or cwe_id in ("N/A", "MULTI"): + return None + if not cwe_id.startswith("CWE-"): + cwe_id = f"CWE-{cwe_id}" + return cwe_id + + def lookup_by_cwe( + self, + cwe_id: str | None, + include_non_mitigating: bool = False, + ) -> list[HardeningEntry]: + """Return hardening entries that match the given CWE ID. + + By default, only returns flags that provide actual runtime mitigation + (flag_type: runtime, linker). Warning-only and optimization flags are + excluded since they don't mitigate vulnerabilities at runtime. + + Args: + cwe_id: The CWE identifier (e.g., 'CWE-121' or '121') + include_non_mitigating: If True, include warning/optimization flags + that don't provide runtime mitigation (for auditing purposes) + + Returns: + List of HardeningEntry objects that help mitigate this CWE + """ + if not cwe_id: + return [] + + normalized = self._normalize_cwe_id(cwe_id) + if not normalized: + return [] + + entries = self._cwe_index.get(normalized, []) + + if not include_non_mitigating: + entries = [e for e in entries if e.flag_type in MITIGATING_FLAG_TYPES] + + logger.debug( + "HardeningKB lookup for %s: found %d entries (include_non_mitigating=%s)", + normalized, + len(entries), + include_non_mitigating, + ) + return list(entries) + + def get_all_entries(self) -> list[HardeningEntry]: + """Return all hardening entries in the knowledge base.""" + return list(self._entries) + + diff --git a/src/exploit_iq_commons/utils/source_rpm_downloader.py b/src/exploit_iq_commons/utils/source_rpm_downloader.py index 9022b40de..01fa0a240 100644 --- a/src/exploit_iq_commons/utils/source_rpm_downloader.py +++ b/src/exploit_iq_commons/utils/source_rpm_downloader.py @@ -16,7 +16,6 @@ import os from pathlib import Path import gzip -import tarfile import xml.etree.ElementTree as ET import subprocess from concurrent.futures import ThreadPoolExecutor, 
as_completed @@ -39,17 +38,12 @@ class RepoUrl: def extract_archives_in_folder(folder: str): """ - Extracts all .tar.* files in the given folder to subdirectories. - Supports .tar.gz, .tgz, .tar.xz, .txz, .tar.bz2, .tbz2, and .tar. + Extracts all .tar.* files in the given folder to subdirectories using bsdtar. + Supports .tar.gz, .tgz, .tar.xz, .txz, .tar.bz2, .tbz, .tbz2, .tar.lzma, .tlz, and .tar. :param folder: Path to the folder to scan. """ - supported_extensions = { - '.tar.gz', '.tgz', - '.tar.xz', '.txz', - '.tar.bz2', '.tbz2', - '.tar' - } + shorthand_extensions = ('.tar', '.tgz', '.txz', '.tbz', '.tbz2', '.tlz') folder_path = Path(folder) @@ -57,25 +51,34 @@ def extract_archives_in_folder(folder: str): if not file.is_file(): continue - suffix = ''.join(file.suffixes[-2:]) if len(file.suffixes) >= 2 else file.suffix - if suffix not in supported_extensions and file.suffix not in supported_extensions: + name_lower = file.name.lower() + is_tarball = '.tar.' in name_lower or name_lower.endswith(shorthand_extensions) + if not is_tarball: continue - # Determine full suffix like .tar.gz or .tgz - full_suffix = suffix if suffix in supported_extensions else file.suffix + suffix = ''.join(file.suffixes[-2:]) if len(file.suffixes) >= 2 else file.suffix + full_suffix = suffix if suffix.startswith('.tar') else file.suffix # Determine output directory name output_dir = file.with_suffix('').with_suffix('').stem if full_suffix.startswith('.tar') else file.stem output_path = folder_path / output_dir logger.debug(f"Extracting: {file.name} → {output_path}") + if output_path.exists(): + logger.debug(f"Output path already exists: {output_path}") + continue  # skip only this archive; keep processing the remaining files output_path.mkdir(exist_ok=True) try: - with tarfile.open(file, 'r:*') as tar: - tar.extractall(path=output_path) + result = subprocess.run( + ['bsdtar', '-xf', str(file), '-C', str(output_path)], + capture_output=True, + text=True + ) + if result.returncode != 0: + logger.error(f"Failed to extract {file.name}: 
{result.stderr.strip()}") except Exception as e: - logger.debug(f"Failed to extract {file.name}: {e}") + logger.error(f"Could not run extraction for {file.name}: {e}") class RPMDependencyManager: """ @@ -476,7 +479,8 @@ def parse_sbom(self): logger.info(f"Found {len(packages)} packages in SBOM, platform: {platform_version}") return packages, platform_version - def extract_src_rpm(self, rpm_path: Path, extract_dir: Path): + @staticmethod + def extract_src_rpm(rpm_path: Path, extract_dir: Path): #logger.info(f" Extracting {rpm_path.name} to {extract_dir} ...") extract_dir.mkdir(parents=True, exist_ok=True) try: diff --git a/src/vuln_analysis/configs/brew/external-user-profile.yml b/src/vuln_analysis/configs/brew/external-user-profile.yml new file mode 100644 index 000000000..3d0dd1984 --- /dev/null +++ b/src/vuln_analysis/configs/brew/external-user-profile.yml @@ -0,0 +1,23 @@ +# External User Profile — public Fedora Koji (no VPN / no auth) +# +# Assumptions: +# - Packages are resolved and downloaded from koji.fedoraproject.org +# - Build logs are not fetched (optional for internal; unavailable or unused for external v1) + +profile: + name: fedora-public + +hosts: + rpm: + brew_hub: https://koji.fedoraproject.org/kojihub + brew_download: https://kojipkgs.fedoraproject.org + +default_arch: x86_64 + +# Keep true for cluster deploys (matches internal profile); use false only for local development. 
+ssl_verify: true + +build_log: + auto_fetch: false + +download_binary_rpm: false diff --git a/src/vuln_analysis/configs/brew/internal-user-profile.yml b/src/vuln_analysis/configs/brew/internal-user-profile.yml new file mode 100644 index 000000000..35df8e035 --- /dev/null +++ b/src/vuln_analysis/configs/brew/internal-user-profile.yml @@ -0,0 +1,26 @@ +# Internal User Profile — Red Hat VPN-connected environment +# +# Assumptions: +# - User is on the Red Hat VPN (can reach *.redhat.com internal hosts) +# - Build logs are available via Brew task output + +profile: + name: redhat-internal + +hosts: + rpm: + brew_hub: https://brewhub.engineering.redhat.com/brewhub + brew_download: https://download-01.beak-001.prod.iad2.dc.redhat.com/brewroot + git: + dist_git: https://pkgs.devel.redhat.com/cgit + +default_arch: x86_64 + +ssl_verify: true +# RH engineering hosts (brewhub, brew CDN) — use git-ca-bundle, not service-ca.crt +verify_path: /app/git-ca-bundle/ca-bundle.crt + +build_log: + auto_fetch: true + +download_binary_rpm: false diff --git a/src/vuln_analysis/configs/config-http-openai.yml b/src/vuln_analysis/configs/config-http-openai.yml index 80c16d7b5..1ee69cfb4 100644 --- a/src/vuln_analysis/configs/config-http-openai.yml +++ b/src/vuln_analysis/configs/config-http-openai.yml @@ -80,6 +80,11 @@ functions: Code Keyword Search: _type: lexical_code_search top_k: 5 + Source Grep: + _type: source_grep + base_checker_dir: .cache/am_cache/checker + max_results: 50 + context_lines: 2 CVE Web Search: _type: serp_wrapper max_retries: 5 @@ -157,6 +162,37 @@ functions: generate_intel_score: true intel_low_score: 51 insist_analysis: false + cve_source_acquisition: + _type: cve_source_acquisition + base_git_dir: .cache/am_cache/git + base_pickle_dir: .cache/am_cache/pickle + base_rpm_dir: .cache/am_cache/rpms + rpm_user_type: ${RPM_USER_TYPE:-internal} + cve_checker_segmentation: + _type: cve_checker_segmentation + base_checker_dir: .cache/am_cache/checker + base_code_index_dir: 
.cache/am_cache/code_index + cve_package_code_agent: + _type: cve_package_code_agent + llm_name: cve_agent_executor_llm + base_checker_dir: .cache/am_cache/checker + base_code_index_dir: .cache/am_cache/code_index + rpm_user_type: ${RPM_USER_TYPE:-internal} + tool_names: + - Source Grep + - Code Keyword Search + cve_checker_report: + _type: cve_checker_report + llm_name: cve_agent_executor_llm + base_checker_dir: .cache/am_cache/checker + cve_build_agent: + _type: cve_build_agent + llm_name: cve_agent_executor_llm + base_checker_dir: .cache/am_cache/checker + max_iterations: 10 + tool_names: + - Source Grep + - Code Keyword Search health_check: _type: health_check @@ -248,6 +284,11 @@ workflow: cve_summarize_name: cve_summarize cve_justify_name: cve_justify cve_output_config_name: cve_http_output + cve_source_acquisition_name: cve_source_acquisition + cve_checker_segmentation_name: cve_checker_segmentation + cve_package_code_agent_name: cve_package_code_agent + cve_checker_report_name: cve_checker_report + cve_build_agent_name: cve_build_agent eval: general: diff --git a/src/vuln_analysis/configs/openapi/openapi.json b/src/vuln_analysis/configs/openapi/openapi.json index 79feca4cc..e0cdb1ad2 100644 --- a/src/vuln_analysis/configs/openapi/openapi.json +++ b/src/vuln_analysis/configs/openapi/openapi.json @@ -1,3043 +1,4602 @@ { "openapi": "3.1.0", "info": { - "title": "FastAPI", - "version": "0.1.0" + "title": "FastAPI", + "version": "0.1.0" }, "paths": { - "/generate": { - "post": { - "summary": "Post Single", - "description": "Executes the default AIQ Toolkit workflow from the loaded configuration", - "operationId": "post_single_generate_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentMorpheusInput-Input" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": 
"#/components/schemas/AgentMorpheusOutput" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + "/generate": { + "post": { + "summary": "Post Single", + "description": "Executes the default NAT workflow from the loaded configuration", + "operationId": "post_single_generate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusInput-Input" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusOutput" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } } - }, - "/generate/stream": { - "post": { - "summary": "Post Stream", - "description": "Executes the default AIQ Toolkit workflow from the loaded configuration", - "operationId": "post_stream_generate_stream_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentMorpheusInput-Input" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentMorpheusOutput" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error 
occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + } + } + }, + "/generate/stream": { + "post": { + "summary": "Post Stream", + "description": "Executes the default NAT workflow from the loaded configuration", + "operationId": "post_stream_generate_stream_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusInput-Input" } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusOutput" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } } - }, - "/generate/full": { - "post": { - "summary": "Post Stream", - "description": "Stream raw intermediate steps without any step adaptor translations.\nUse filter_steps query parameter to filter steps by type (comma-separated list) or set to 'none' to suppress all intermediate steps.", - "operationId": "post_stream_generate_full_post", - "parameters": [ - { - "name": "filter_steps", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Filter Steps" - } - } + } + } + }, + "/generate/full": { + "post": { + "summary": "Post Stream", + "description": "Stream raw intermediate steps without any step adaptor translations.\nUse filter_steps query parameter to filter steps by type (comma-separated list) or set to 'none' to suppress all intermediate steps.", + "operationId": "post_stream_generate_full_post", 
+ "parameters": [ + { + "name": "filter_steps", + "in": "query", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } ], - "requestBody": { - "required": true, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentMorpheusInput-Input" - } - } - } - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AgentMorpheusOutput" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + "title": "Filter Steps" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusInput-Input" } + } } - }, - "/chat": { - "post": { - "summary": "Post Single", - "description": "Executes the default AIQ Toolkit workflow from the loaded configuration", - "operationId": "post_single_chat_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQChatRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQChatResponse" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + }, + "responses": { + "200": { + "description": "Successful 
Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AgentMorpheusOutput" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } - }, - "/chat/stream": { - "post": { - "summary": "Post Stream", - "description": "Executes the default AIQ Toolkit workflow from the loaded configuration", - "operationId": "post_stream_chat_stream_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQChatRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "anyOf": [ - { - "$ref": "#/components/schemas/AIQChatResponseChunk" - }, - { - "$ref": "#/components/schemas/AIQResponseIntermediateStep" - } - ], - "title": "Response Post Stream Chat Stream Post" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + } + } + }, + "/generate/async": { + "post": { + "summary": "Start Async Generation", + "description": "Start an async generate job", + "operationId": "start_async_generation_generate_async_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AsyncGenerateRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + 
"anyOf": [ + { + "$ref": "#/components/schemas/AsyncGenerateResponse" + }, + { + "$ref": "#/components/schemas/AsyncGenerationStatusResponse" + } + ], + "title": "Response Start Async Generation Generate Async Post" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } - }, - "/evaluate/job/last": { - "get": { - "summary": "Get Last Job Status", - "description": "Get the status of the last created evaluation job", - "operationId": "get_last_job_status_evaluate_job_last_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQEvaluateStatusResponse" - } - } - } - }, - "404": { - "description": "No jobs found" - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - } + } + } + }, + "/generate/async/job/{job_id}": { + "get": { + "summary": "Get Async Job Status", + "description": "Get the status of an async job", + "operationId": "get_async_job_status_generate_async_job__job_id__get", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "title": "Job Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AsyncGenerationStatusResponse" + } + } + } + }, + "404": { + "description": "Job not found" + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + 
"description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } - }, - "/evaluate/job/{job_id}": { - "get": { - "summary": "Get Job Status", - "description": "Get the status of an evaluation job", - "operationId": "get_job_status_evaluate_job__job_id__get", - "parameters": [ - { - "name": "job_id", - "in": "path", - "required": true, - "schema": { - "type": "string", - "title": "Job Id" - } - } + } + } + }, + "/chat": { + "post": { + "summary": "Post Single", + "description": "Executes the default NAT workflow from the loaded configuration", + "operationId": "post_single_chat_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatResponse" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/chat/stream": { + "post": { + "summary": "Post Stream", + "description": "Executes the default NAT workflow from the loaded configuration", + "operationId": "post_stream_chat_stream_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatResponseChunk" + }, + { + "$ref": "#/components/schemas/ResponseIntermediateStep" + } + ], + "title": 
"Response Post Stream Chat Stream Post" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/v1/chat/completions": { + "post": { + "summary": "Post Openai Api Compatible", + "description": "Executes the default NAT workflow from the loaded configuration (OpenAI Chat Completions API compatible)", + "operationId": "post_openai_api_compatible_v1_chat_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatResponse" + }, + { + "$ref": "#/components/schemas/ChatResponseChunk" + } + ], + "title": "Response Post Openai Api Compatible V1 Chat Completions Post" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/evaluate/job/last": { + "get": { + "summary": "Get Last Job Status", + "description": "Get the status of the last created evaluation job", + "operationId": "get_last_job_status_evaluate_job_last_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateStatusResponse" + } + } + } + }, + "404": { + "description": "No jobs found" + }, + "500": { + 
"description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + } + } + } + }, + "/evaluate/job/{job_id}": { + "get": { + "summary": "Get Job Status", + "description": "Get the status of an evaluation job", + "operationId": "get_job_status_evaluate_job__job_id__get", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "title": "Job Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateStatusResponse" + } + } + } + }, + "404": { + "description": "Job not found" + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/evaluate/jobs": { + "get": { + "summary": "Get Jobs", + "description": "Get all jobs, optionally filtered by status", + "operationId": "get_jobs_evaluate_jobs_get", + "parameters": [ + { + "name": "status", + "in": "query", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQEvaluateStatusResponse" - } - } - } - }, - "404": { - "description": "Job not found" - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - 
} - } - } - } + "title": "Status" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluateStatusResponse" + }, + "title": "Response Get Jobs Evaluate Jobs Get" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } - }, - "/evaluate/jobs": { - "get": { - "summary": "Get Jobs", - "description": "Get all jobs, optionally filtered by status", - "operationId": "get_jobs_evaluate_jobs_get", - "parameters": [ - { - "name": "status", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Status" - } - } + } + } + }, + "/evaluate": { + "post": { + "summary": "Start Evaluation", + "description": "Evaluates the performance and accuracy of the workflow on a dataset", + "operationId": "start_evaluation_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/auth/redirect": { + "get": { + "summary": "Redirect 
Uri", + "description": "Handles the authorization code and state returned from the Authorization Code Grant Flow.", + "operationId": "redirect_uri_auth_redirect_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + + } + } + } + } + } + } + }, + "/health": { + "get": { + "summary": "Get Single", + "description": "Perform a health check.", + "operationId": "get_single_health_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthStatusResponse" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + } + } + } + }, + "/health/stream": { + "get": { + "summary": "Get Stream", + "description": "Perform a health check.", + "operationId": "get_stream_health_stream_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HealthStatusResponse" + } + } + } + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + } + } + } + }, + "/health/full": { + "get": { + "summary": "Get Stream", + "description": "Stream raw intermediate steps without any step adaptor translations.\nUse filter_steps query parameter to filter steps by type (comma-separated list) or set to 'none' to suppress all intermediate steps.", + "operationId": "get_stream_health_full_get", + "parameters": [ + { + "name": "filter_steps", + "in": "query", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } ], - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "type": "array", - 
"items": { - "$ref": "#/components/schemas/AIQEvaluateStatusResponse" - }, - "title": "Response Get Jobs Evaluate Jobs Get" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + "title": "Filter Steps" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } - }, - "/evaluate": { - "post": { - "summary": "Start Evaluation", - "description": "Evaluates the performance and accuracy of the workflow on a dataset", - "operationId": "start_evaluation_evaluate_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQEvaluateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AIQEvaluateResponse" - } - } - } - }, - "500": { - "description": "Internal Server Error", - "content": { - "application/json": { - "example": { - "detail": "Internal server error occurred" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } + } + } + }, + "/health/async/job/{job_id}": { + "get": { + "summary": "Get Async Job Status", + "description": "Get the status of an async job", + "operationId": "get_async_job_status_health_async_job__job_id__get", + "parameters": [ + { + "name": "job_id", + "in": "path", + 
"required": true, + "schema": { + "type": "string", + "title": "Job Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AsyncGenerationStatusResponse" + } + } + } + }, + "404": { + "description": "Job not found" + }, + "500": { + "description": "Internal Server Error", + "content": { + "application/json": { + "example": { + "detail": "Internal server error occurred" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } } + } } + } } + } }, "components": { - "schemas": { - "AIQChatRequest": { - "properties": { - "messages": { - "items": { - "$ref": "#/components/schemas/Message" - }, - "type": "array", - "title": "Messages" - }, - "model": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Model" - }, - "temperature": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Temperature" - }, - "max_tokens": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "title": "Max Tokens" - }, - "top_p": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Top P" - } + "schemas": { + "AcquiredArtifacts": { + "properties": { + "srpm_path": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "additionalProperties": true, - "type": "object", - "required": [ - "messages" - ], - "title": "AIQChatRequest", - "description": "AIQChatRequest is a data model that represents a request to the AIQ Toolkit chat API." 
- }, - "AIQChatResponse": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "object": { - "type": "string", - "title": "Object" - }, - "model": { - "type": "string", - "title": "Model", - "default": "" - }, - "created": { - "type": "string", - "format": "date-time", - "title": "Created" - }, - "choices": { - "items": { - "$ref": "#/components/schemas/AIQChoice" - }, - "type": "array", - "title": "Choices" - }, - "usage": { - "anyOf": [ - { - "$ref": "#/components/schemas/AIQUsage" - }, - { - "type": "null" - } - ] - } + { + "type": "null" + } + ], + "title": "Srpm Path" + }, + "source_dir": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "additionalProperties": true, - "type": "object", - "required": [ - "id", - "object", - "created", - "choices" - ], - "title": "AIQChatResponse", - "description": "AIQChatResponse is a data model that represents a response from the AIQ Toolkit chat API." - }, - "AIQChatResponseChunk": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "choices": { - "items": { - "$ref": "#/components/schemas/AIQChoice" - }, - "type": "array", - "title": "Choices" - }, - "created": { - "type": "string", - "format": "date-time", - "title": "Created" - }, - "model": { - "type": "string", - "title": "Model", - "default": "" - }, - "object": { - "type": "string", - "title": "Object", - "default": "chat.completion.chunk" - } + { + "type": "null" + } + ], + "title": "Source Dir" + }, + "build_log_path": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "additionalProperties": true, - "type": "object", - "required": [ - "id", - "choices", - "created" - ], - "title": "AIQChatResponseChunk", - "description": "AIQChatResponseChunk is a data model that represents a response chunk from the AIQ Toolkit chat streaming API." 
+ { + "type": "null" + } + ], + "title": "Build Log Path" }, - "AIQChoice": { - "properties": { - "message": { - "$ref": "#/components/schemas/AIQChoiceMessage" - }, - "finish_reason": { - "anyOf": [ - { - "type": "string", - "enum": [ - "stop", - "length", - "tool_calls", - "content_filter", - "function_call" - ] - }, - { - "type": "null" - } - ], - "title": "Finish Reason" - }, - "index": { - "type": "integer", - "title": "Index" - } + "binary_rpm_path": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "additionalProperties": true, - "type": "object", - "required": [ - "message", - "index" - ], - "title": "AIQChoice" - }, - "AIQChoiceMessage": { - "properties": { - "content": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Content" - }, - "role": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Role" - } + { + "type": "null" + } + ], + "title": "Binary Rpm Path" + }, + "patch_source_dir": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "type": "object", - "title": "AIQChoiceMessage" + { + "type": "null" + } + ], + "title": "Patch Source Dir" }, - "AIQEvaluateRequest": { - "properties": { - "config_file": { - "type": "string", - "title": "Config File", - "description": "Path to the configuration file for evaluation" - }, - "job_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Job Id", - "description": "Unique identifier for the evaluation job" - }, - "reps": { - "type": "integer", - "title": "Reps", - "description": "Number of repetitions for the evaluation, defaults to 1", - "default": 1 - }, - "expiry_seconds": { - "type": "integer", - "title": "Expiry Seconds", - "description": "Optional time (in seconds) before the job expires. 
Clamped between 600 (10 min) and 86400 (24h).", - "default": 3600 - } + "patch_diff_path": { + "anyOf": [ + { + "type": "string", + "format": "path" }, - "type": "object", - "required": [ - "config_file" - ], - "title": "AIQEvaluateRequest", - "description": "Request model for the evaluate endpoint." - }, - "AIQEvaluateResponse": { - "properties": { - "job_id": { - "type": "string", - "title": "Job Id", - "description": "Unique identifier for the evaluation job" - }, - "status": { - "type": "string", - "title": "Status", - "description": "Current status of the evaluation job" - } + { + "type": "null" + } + ], + "title": "Patch Diff Path" + }, + "source_url": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "job_id", - "status" - ], - "title": "AIQEvaluateResponse", - "description": "Response model for the evaluate endpoint." - }, - "AIQEvaluateStatusResponse": { - "properties": { - "job_id": { - "type": "string", - "title": "Job Id", - "description": "Unique identifier for the evaluation job" - }, - "status": { - "type": "string", - "title": "Status", - "description": "Current status of the evaluation job" - }, - "config_file": { - "type": "string", - "title": "Config File", - "description": "Path to the configuration file used for evaluation" - }, - "error": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Error", - "description": "Error message if the job failed" - }, - "output_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Output Path", - "description": "Path to the output file if the job completed successfully" - }, - "created_at": { - "type": "string", - "format": "date-time", - "title": "Created At", - "description": "Timestamp when the job was created" - }, - "updated_at": { - "type": "string", - "format": "date-time", - "title": "Updated At", - "description": "Timestamp when the job was last updated" - }, - "expires_at": { - "anyOf": [ - { - 
"type": "string", - "format": "date-time" - }, - { - "type": "null" - } - ], - "title": "Expires At", - "description": "Timestamp when the job will expire" - } + { + "type": "null" + } + ], + "title": "Source Url" + } + }, + "type": "object", + "title": "AcquiredArtifacts", + "description": "Resolved file locations populated by source_acquisition, consumed by downstream checker nodes." + }, + "AgentIntermediateStep": { + "properties": { + "tool_name": { + "type": "string", + "title": "Tool Name" + }, + "action_log": { + "type": "string", + "title": "Action Log" + }, + "tool_input": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "job_id", - "status", - "config_file", - "created_at", - "updated_at" - ], - "title": "AIQEvaluateStatusResponse", - "description": "Response model for the evaluate status endpoint." - }, - "AIQResponseIntermediateStep": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "parent_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Parent Id" - }, - "type": { - "type": "string", - "title": "Type", - "default": "markdown" - }, - "name": { - "type": "string", - "title": "Name" - }, - "payload": { - "type": "string", - "title": "Payload" - } + { + "type": "object" + } + ], + "title": "Tool Input" + }, + "tool_output": { + "title": "Tool Output" + } + }, + "type": "object", + "required": [ + "tool_name", + "action_log", + "tool_input", + "tool_output" + ], + "title": "AgentIntermediateStep", + "description": "Represents info for an intermediate step taken by an agent." 
+ }, + "AgentMorpheusEngineOutput": { + "properties": { + "vuln_id": { + "type": "string", + "title": "Vuln Id" + }, + "checklist": { + "items": { + "$ref": "#/components/schemas/ChecklistItemOutput" + }, + "type": "array", + "title": "Checklist" + }, + "summary": { + "type": "string", + "title": "Summary" + }, + "justification": { + "$ref": "#/components/schemas/JustificationOutput" + }, + "intel_score": { + "type": "integer", + "title": "Intel Score" + }, + "cvss": { + "anyOf": [ + { + "$ref": "#/components/schemas/CVSSOutput" }, - "additionalProperties": true, - "type": "object", - "required": [ - "id", - "name", - "payload" - ], - "title": "AIQResponseIntermediateStep", - "description": "AIQResponseSerializedStep is a data model that represents a serialized step in the AIQ Toolkit chat streaming API." - }, - "AIQUsage": { - "properties": { - "prompt_tokens": { - "type": "integer", - "title": "Prompt Tokens" - }, - "completion_tokens": { - "type": "integer", - "title": "Completion Tokens" - }, - "total_tokens": { - "type": "integer", - "title": "Total Tokens" - } + { + "type": "null" + } + ] + } + }, + "type": "object", + "required": [ + "vuln_id", + "checklist", + "summary", + "justification", + "intel_score", + "cvss" + ], + "title": "AgentMorpheusEngineOutput", + "description": "Contains all output generated by the main Agent Morpheus LLM Engine for a given vulnerability.\n\n- vuln_id: the ID of the vulnerability being processed by the LLM engine.\n- checklist: a list of ChecklistItemOutput objects, each containing an input and a response from the LLM agent.\n- summary: a short summary of the checklist inputs and responses, generated by an LLM.\n- justification: a JustificationOutput object containing details of the model's justification decision.\n- cvss: a CVSSOutput object containing the CVSS score and vector string for the vulnerability." 
+ }, + "AgentMorpheusInfo": { + "properties": { + "vdb": { + "anyOf": [ + { + "$ref": "#/components/schemas/VdbPaths" }, - "type": "object", - "required": [ - "prompt_tokens", - "completion_tokens", - "total_tokens" - ], - "title": "AIQUsage" + { + "type": "null" + } + ] }, - "AgentIntermediateStep": { - "properties": { - "tool_name": { - "type": "string", - "title": "Tool Name" - }, - "action_log": { - "type": "string", - "title": "Action Log" - }, - "tool_input": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object" - } - ], - "title": "Tool Input" - }, - "tool_output": { - "title": "Tool Output" - } + "intel": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/CveIntel" + }, + "type": "array" }, - "type": "object", - "required": [ - "tool_name", - "action_log", - "tool_input", - "tool_output" - ], - "title": "AgentIntermediateStep", - "description": "Represents info for an intermediate step taken by an agent." - }, - "AgentMorpheusEngineOutput": { - "properties": { - "vuln_id": { - "type": "string", - "title": "Vuln Id" - }, - "checklist": { - "items": { - "$ref": "#/components/schemas/ChecklistItemOutput" - }, - "type": "array", - "title": "Checklist" - }, - "summary": { - "type": "string", - "title": "Summary" - }, - "justification": { - "$ref": "#/components/schemas/JustificationOutput" - }, - "intel_score": { - "type": "integer", - "title": "Intel Score" - }, - "cvss": { - "anyOf": [ - { - "$ref": "#/components/schemas/CVSSOutput" - }, - { - "type": "null" - } - ] - } + { + "type": "null" + } + ], + "title": "Intel" + }, + "sbom": { + "anyOf": [ + { + "$ref": "#/components/schemas/SBOMInfo" }, - "type": "object", - "required": [ - "vuln_id", - "checklist", - "summary", - "justification", - "intel_score", - "cvss" - ], - "title": "AgentMorpheusEngineOutput", - "description": "Contains all output generated by the main Agent Morpheus LLM Engine for a given vulnerability.\n\n- vuln_id: the ID of the vulnerability being processed by the 
LLM engine.\n- checklist: a list of ChecklistItemOutput objects, each containing an input and a response from the LLM agent.\n- summary: a short summary of the checklist inputs and responses, generated by an LLM.\n- justification: a JustificationOutput object containing details of the model's justification decision.\n- intel_score: the intelligence score for the vulnerability.\n- cvss: a CVSSOutput object containing the CVSS score and vector string for the vulnerability." - }, - "AgentMorpheusInfo": { - "properties": { - "vdb": { - "anyOf": [ - { - "$ref": "#/components/schemas/VdbPaths" - }, - { - "type": "null" - } - ] - }, - "intel": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/CveIntel" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Intel" - }, - "sbom": { - "anyOf": [ - { - "$ref": "#/components/schemas/SBOMInfo" - }, - { - "type": "null" - } - ] - }, - "vulnerable_dependencies": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/VulnerableDependencies" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Vulnerable Dependencies" - } + { + "type": "null" + } + ] + }, + "vulnerable_dependencies": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/VulnerableDependencies" + }, + "type": "array" }, - "type": "object", - "title": "AgentMorpheusInfo", - "description": "Information used for decisioning in the Agent Morpheus engine. 
These information can all be automatically\ngenerated or retrieved by the pipeline from the input information.\n\n- vdb: paths to source code and documentation vector databases (VDBs) used to understand whether a vulnerability\n is exploitable in the source code.\n- intel: list of CveIntel objects representing intelligence for each vulnerability pulled from various vulnerability\n databases and APIs.\n- sbom: software bill of materials listing the packages and versions in the container image, used to understand\n whether the vulnerable package exists in the image.\n- vulnerable_dependencies: a list of VulnerableDependencies objects for each vuln_id, representing the SBOM packages\n and transitive dependencies that are vulnerable for the vuln_id." + { + "type": "null" + } + ], + "title": "Vulnerable Dependencies" }, - "AgentMorpheusInput-Input": { - "properties": { - "scan": { - "$ref": "#/components/schemas/ScanInfoInput" - }, - "image": { - "$ref": "#/components/schemas/ImageInfoInput-Input" - } + "checker_context": { + "anyOf": [ + { + "$ref": "#/components/schemas/PackageCheckerContext" }, - "type": "object", - "required": [ - "scan", - "image" - ], - "title": "AgentMorpheusInput", - "description": "Inputs required by the Agent Morpheus pipeline." + { + "type": "null" + } + ] + } + }, + "type": "object", + "title": "AgentMorpheusInfo", + "description": "Information used for decisioning in the Agent Morpheus engine. 
This information can all be automatically\ngenerated or retrieved by the pipeline from the input information.\n\n- vdb: paths to source code and documentation vector databases (VDBs) used to understand whether a vulnerability\n is exploitable in the source code.\n- intel: list of CveIntel objects representing intelligence for each vulnerability pulled from various vulnerability\n databases and APIs.\n- sbom: software bill of materials listing the packages and versions in the container image, used to understand\n whether the vulnerable package exists in the image.\n- vulnerable_dependencies: a list of VulnerableDependencies objects for each vuln_id, representing the SBOM packages\n and transitive dependencies that are vulnerable for the vuln_id." + }, + "AgentMorpheusInput-Input": { + "properties": { + "scan": { + "$ref": "#/components/schemas/ScanInfoInput" }, - "AgentMorpheusInput-Output": { - "properties": { - "scan": { - "$ref": "#/components/schemas/ScanInfoInput" - }, - "image": { - "$ref": "#/components/schemas/ImageInfoInput-Output" - } + "image": { + "$ref": "#/components/schemas/ImageInfoInput-Input" + }, + "credential_id": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "scan", - "image" - ], - "title": "AgentMorpheusInput", - "description": "Inputs required by the Agent Morpheus pipeline."
+ { + "type": "null" + } + ], + "title": "Credential Id" }, - "AgentMorpheusOutput": { - "properties": { - "input": { - "$ref": "#/components/schemas/AgentMorpheusInput-Output" - }, - "info": { - "$ref": "#/components/schemas/AgentMorpheusInfo" - }, - "output": { - "$ref": "#/components/schemas/OutputPayload" - } + "code_index_success": { + "anyOf": [ + { + "type": "boolean" }, - "type": "object", - "required": [ - "input", - "info", - "output" - ], - "title": "AgentMorpheusOutput", - "description": "\"\nThe final output of the Agent Morpheus pipeline.\nContains all fields in the AgentMorpheusEngineInput, plus the AgentMorpheusEngineOuput for each input vulnerability." - }, - "AudioContent": { - "properties": { - "type": { - "type": "string", - "const": "input_audio", - "title": "Type", - "default": "input_audio" - }, - "input_audio": { - "$ref": "#/components/schemas/InputAudio", - "default": { - "data": "default", - "format": "default" - } - } + { + "type": "null" + } + ], + "title": "Code Index Success" + }, + "failure_reason": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": false, - "type": "object", - "title": "AudioContent" + { + "type": "null" + } + ], + "title": "Failure Reason", + "default": "No failure reason provided" + } + }, + "type": "object", + "required": [ + "scan", + "image" + ], + "title": "AgentMorpheusInput", + "description": "Inputs required by the Agent Morpheus pipeline." 
+ }, + "AgentMorpheusInput-Output": { + "properties": { + "scan": { + "$ref": "#/components/schemas/ScanInfoInput" }, - "BaseMetricV3": { - "properties": { - "cvssV3": { - "$ref": "#/components/schemas/CVSSV3" - }, - "exploitabilityScore": { - "type": "number", - "title": "Exploitabilityscore" - }, - "impactScore": { - "type": "number", - "title": "Impactscore" - } + "image": { + "$ref": "#/components/schemas/ImageInfoInput-Output" + }, + "credential_id": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "cvssV3", - "exploitabilityScore", - "impactScore" - ], - "title": "BaseMetricV3" - }, - "Bugzilla": { - "properties": { - "description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Description" - }, - "id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Id" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Url" - } + { + "type": "null" + } + ], + "title": "Credential Id" + }, + "code_index_success": { + "anyOf": [ + { + "type": "boolean" }, - "type": "object", - "title": "Bugzilla" - }, - "CVSS": { - "properties": { - "score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Score" - }, - "vector_string": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Vector String" - } + { + "type": "null" + } + ], + "title": "Code Index Success" + }, + "failure_reason": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "title": "CVSS" - }, - "CVSS3": { - "properties": { - "cvss3_base_score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Cvss3 Base Score" - }, - "cvss3_scoring_vector": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cvss3 Scoring Vector" - }, - "status": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Status" - } 
+ { + "type": "null" + } + ], + "title": "Failure Reason", + "default": "No failure reason provided" + } + }, + "type": "object", + "required": [ + "scan", + "image" + ], + "title": "AgentMorpheusInput", + "description": "Inputs required by the Agent Morpheus pipeline." + }, + "AgentMorpheusOutput": { + "properties": { + "input": { + "$ref": "#/components/schemas/AgentMorpheusInput-Output" + }, + "info": { + "$ref": "#/components/schemas/AgentMorpheusInfo" + }, + "output": { + "$ref": "#/components/schemas/OutputPayload" + } + }, + "type": "object", + "required": [ + "input", + "info", + "output" + ], + "title": "AgentMorpheusOutput", + "description": "\"\nThe final output of the Agent Morpheus pipeline.\nContains all fields in the AgentMorpheusEngineInput, plus the AgentMorpheusEngineOutput for each input vulnerability." + }, + "AnalysisType": { + "type": "string", + "enum": [ + "image", + "source" + ], + "title": "AnalysisType" + }, + "AsyncGenerateRequest": { + "properties": { + "scan": { + "$ref": "#/components/schemas/ScanInfoInput" + }, + "image": { + "$ref": "#/components/schemas/ImageInfoInput-Input" + }, + "credential_id": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "title": "CVSS3" + { + "type": "null" + } + ], + "title": "Credential Id" }, - "CVSSOutput": { - "properties": { - "vector_string": { - "type": "string", - "title": "Vector String" - }, - "score": { - "type": "string", - "title": "Score" - } + "code_index_success": { + "anyOf": [ + { + "type": "boolean" }, - "type": "object", - "required": [ - "vector_string", - "score" - ], - "title": "CVSSOutput", - "description": "CVSS (Common Vulnerability Scoring System) representing the severity of a vulnerability in reference to an image.\n- vector_string: The CVSS vector string that encodes the metric values used to calculate the score.\n- score: The calculated CVSS base score representing the severity of the vulnerability in the given image."
- }, - "CVSSV3": { - "properties": { - "attackComplexity": { - "type": "string", - "title": "Attackcomplexity" - }, - "attackVector": { - "type": "string", - "title": "Attackvector" - }, - "availabilityImpact": { - "type": "string", - "title": "Availabilityimpact" - }, - "baseScore": { - "type": "number", - "title": "Basescore" - }, - "baseSeverity": { - "type": "string", - "title": "Baseseverity" - }, - "confidentialityImpact": { - "type": "string", - "title": "Confidentialityimpact" - }, - "integrityImpact": { - "type": "string", - "title": "Integrityimpact" - }, - "privilegesRequired": { - "type": "string", - "title": "Privilegesrequired" - }, - "scope": { - "type": "string", - "title": "Scope" - }, - "userInteraction": { - "type": "string", - "title": "Userinteraction" - }, - "vectorString": { - "type": "string", - "title": "Vectorstring" - }, - "version": { - "type": "string", - "title": "Version" - } + { + "type": "null" + } + ], + "title": "Code Index Success" + }, + "failure_reason": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "attackComplexity", - "attackVector", - "availabilityImpact", - "baseScore", - "baseSeverity", - "confidentialityImpact", - "integrityImpact", - "privilegesRequired", - "scope", - "userInteraction", - "vectorString", - "version" - ], - "title": "CVSSV3" + { + "type": "null" + } + ], + "title": "Failure Reason", + "default": "No failure reason provided" }, - "CWE": { - "properties": { - "cwe_id": { - "type": "string", - "title": "Cwe Id" - }, - "name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Name" - } + "job_id": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "cwe_id" - ], - "title": "CWE" + { + "type": "null" + } + ], + "title": "Job Id", + "description": "Unique identifier for the evaluation job" }, - "ChecklistItemOutput": { - "properties": { - "input": { - "type": "string", - "title": "Input" - }, - "response": { -
"type": "string", - "title": "Response" - }, - "intermediate_steps": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/AgentIntermediateStep" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Intermediate Steps" - } + "sync_timeout": { + "type": "integer", + "maximum": 300, + "minimum": 0, + "title": "Sync Timeout", + "description": "Attempt to perform the job synchronously up until `sync_timeout` seconds; if the job hasn't been completed by then, a job_id will be returned with a status code of 202.", + "default": 0 + }, + "expiry_seconds": { + "type": "integer", + "maximum": 86400, + "minimum": 600, + "title": "Expiry Seconds", + "description": "Optional time (in seconds) before the job expires. Clamped between 600 (10 min) and 86400 (24h).", + "default": 3600 + } + }, + "type": "object", + "required": [ + "scan", + "image" + ], + "title": "AsyncGenerateRequest" + }, + "AsyncGenerateResponse": { + "properties": { + "job_id": { + "type": "string", + "title": "Job Id", + "description": "Unique identifier for the job" + }, + "status": { + "type": "string", + "title": "Status", + "description": "Current status of the job" + } + }, + "type": "object", + "required": [ + "job_id", + "status" + ], + "title": "AsyncGenerateResponse", + "description": "Response model for the async generation endpoint." + }, + "AsyncGenerationStatusResponse": { + "properties": { + "job_id": { + "type": "string", + "title": "Job Id", + "description": "Unique identifier for the evaluation job" + }, + "status": { + "type": "string", + "title": "Status", + "description": "Current status of the evaluation job" + }, + "error": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "input", - "response" - ], - "title": "ChecklistItemOutput", - "description": "Input, response, and intermediate steps for a single checklist item provided to the LLM agent."
- }, - "Configuration": { - "properties": { - "package": { - "type": "string", - "title": "Package" - }, - "system": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "System" - }, - "versionStartExcluding": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Versionstartexcluding" - }, - "versionEndExcluding": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Versionendexcluding" - }, - "versionStartIncluding": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Versionstartincluding" - }, - "versionEndIncluding": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Versionendincluding" - } + { + "type": "null" + } + ], + "title": "Error", + "description": "Error message if the job failed" + }, + "created_at": { + "type": "string", + "format": "date-time", + "title": "Created At", + "description": "Timestamp when the job was created" + }, + "updated_at": { + "type": "string", + "format": "date-time", + "title": "Updated At", + "description": "Timestamp when the job was last updated" + }, + "expires_at": { + "anyOf": [ + { + "type": "string", + "format": "date-time" }, - "type": "object", - "required": [ - "package" - ], - "title": "Configuration" + { + "type": "null" + } + ], + "title": "Expires At", + "description": "Timestamp when the job will expire" }, - "CveIntel": { - "properties": { - "vuln_id": { - "type": "string", - "title": "Vuln Id" - }, - "ghsa": { - "anyOf": [ - { - "$ref": "#/components/schemas/CveIntelGhsa" - }, - { - "type": "null" - } - ] - }, - "nvd": { - "anyOf": [ - { - "$ref": "#/components/schemas/CveIntelNvd" - }, - { - "type": "null" - } - ] - }, - "rhsa": { - "anyOf": [ - { - "$ref": "#/components/schemas/CveIntelRhsa" - }, - { - "type": "null" - } - ] - }, - "ubuntu": { - "anyOf": [ - { - "$ref": "#/components/schemas/CveIntelUbuntu" - }, - { - "type": "null" - } - ] - }, - 
"epss": { - "anyOf": [ - { - "$ref": "#/components/schemas/CveIntelEpss" - }, - { - "type": "null" - } - ] - }, - "has_sufficient_intel_for_agent": { - "type": "boolean", - "title": "Has Sufficient Intel For Agent", - "description": "Logic to determine if the CVE has sufficient intel and can be passed to the agent.\n\nReturns\n-------\nbool\n True if enough intel has been found for the CVE", - "readOnly": true - } + "output": { + "anyOf": [ + { + "type": "object" }, - "type": "object", - "required": [ - "vuln_id", - "has_sufficient_intel_for_agent" - ], - "title": "CveIntel", - "description": "Information about a CVE (Common Vulnerabilities and Exposures) entry." - }, - "CveIntelEpss": { - "properties": { - "epss": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Epss" - }, - "percentile": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Percentile" - }, - "date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Date" - } + { + "type": "null" + } + ], + "title": "Output", + "description": "Output of the generate request, this is only available if the job completed successfully." 
+ } + }, + "type": "object", + "required": [ + "job_id", + "status", + "created_at", + "updated_at" + ], + "title": "AsyncGenerationStatusResponse" + }, + "AudioContent": { + "properties": { + "type": { + "type": "string", + "const": "input_audio", + "title": "Type", + "default": "input_audio" + }, + "input_audio": { + "$ref": "#/components/schemas/InputAudio", + "default": { + "data": "default", + "format": "default" + } + } + }, + "additionalProperties": false, + "type": "object", + "title": "AudioContent" + }, + "BaseMetricV3": { + "properties": { + "cvssV3": { + "$ref": "#/components/schemas/CVSSV3" + }, + "exploitabilityScore": { + "anyOf": [ + { + "type": "number" }, - "additionalProperties": true, - "type": "object", - "title": "CveIntelEpss", - "description": "Information about an EPSS (Elastic Product Security Service) entry." + { + "type": "null" + } + ], + "title": "Exploitabilityscore" }, - "CveIntelGhsa": { - "properties": { - "ghsa_id": { - "type": "string", - "title": "Ghsa Id" - }, - "cve_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cve Id" - }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Summary" - }, - "description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Description" - }, - "severity": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Severity" - }, - "vulnerabilities": { - "anyOf": [ - { - "items": {}, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Vulnerabilities" - }, - "cvss": { - "anyOf": [ - { - "$ref": "#/components/schemas/CVSS" - }, - { - "type": "null" - } - ] - }, - "cwes": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/CWE" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Cwes" - }, - "published_at": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Published At" - }, 
- "updated_at": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Updated At" - } + "impactScore": { + "anyOf": [ + { + "type": "number" }, - "additionalProperties": true, - "type": "object", - "required": [ - "ghsa_id" - ], - "title": "CveIntelGhsa", - "description": "Information about a GHSA (GitHub Security Advisory) entry." - }, - "CveIntelNvd": { - "properties": { - "cve_id": { - "type": "string", - "title": "Cve Id" - }, - "cve_description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cve Description" - }, - "cvss_vector": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cvss Vector" - }, - "cvss_base_score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Cvss Base Score" - }, - "cvss_severity": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cvss Severity" - }, - "cwe_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cwe Name" - }, - "cwe_description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cwe Description" - }, - "cwe_extended_description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cwe Extended Description" - }, - "configurations": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/Configuration" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Configurations" - }, - "vendor_names": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Vendor Names" - }, - "references": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "References" - }, - "disputed": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ], - "title": "Disputed" - }, - "published_at": { - 
"anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Published At" - }, - "updated_at": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Updated At" - } + { + "type": "null" + } + ], + "title": "Impactscore" + } + }, + "type": "object", + "required": [ + "cvssV3" + ], + "title": "BaseMetricV3" + }, + "Bugzilla": { + "properties": { + "description": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": true, - "type": "object", - "required": [ - "cve_id" - ], - "title": "CveIntelNvd", - "description": "Information about an NVD (National Vulnerability Database) entry." + { + "type": "null" + } + ], + "title": "Description" }, - "CveIntelRhsa": { - "properties": { - "bugzilla": { - "$ref": "#/components/schemas/Bugzilla" - }, - "details": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Details" - }, - "statement": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Statement" - }, - "package_state": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/PackageState" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Package State" - }, - "upstream_fix": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Upstream Fix" - }, - "cvss3": { - "anyOf": [ - { - "$ref": "#/components/schemas/CVSS3" - }, - { - "type": "null" - } - ] - } + "id": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": true, - "type": "object", - "title": "CveIntelRhsa", - "description": "Information about a RHSA (Red Hat Security Advisory) entry." 
- }, - "CveIntelUbuntu": { - "properties": { - "description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Description" - }, - "notes": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/Note" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Notes" - }, - "notices": { - "anyOf": [ - { - "items": {}, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Notices" - }, - "priority": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Priority" - }, - "ubuntu_description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Ubuntu Description" - }, - "impact": { - "anyOf": [ - { - "$ref": "#/components/schemas/Impact" - }, - { - "type": "null" - } - ] - } - }, - "additionalProperties": true, - "type": "object", - "title": "CveIntelUbuntu", - "description": "Information about a Ubuntu CVE entry." - }, - "DependencyPackage": { - "properties": { - "system": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "System" - }, - "name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Name" - }, - "version": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Version" - }, - "relation": { - "anyOf": [ - { - "type": "string", - "enum": [ - "SELF", - "DIRECT", - "INDIRECT" - ] - }, - { - "type": "null" - } - ], - "title": "Relation" - } + { + "type": "null" + } + ], + "title": "Id" + }, + "url": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "title": "DependencyPackage", - "description": "Information about a dependency package as obtained from deps.dev API for a given SBOM package." 
- }, - "FileSBOMInfoInput": { - "properties": { - "_type": { - "type": "string", - "const": "file", - "title": "Type", - "description": "The type of the object", - "default": "file" - }, - "file_path": { - "type": "string", - "title": "File Path" - } + { + "type": "null" + } + ], + "title": "Url" + } + }, + "type": "object", + "title": "Bugzilla" + }, + "CVSS": { + "properties": { + "score": { + "anyOf": [ + { + "type": "number" }, - "type": "object", - "required": [ - "file_path" - ], - "title": "FileSBOMInfoInput", - "description": "A file path pointing to a Software Bill of Materials file." - }, - "HTTPSBOMInfoInput": { - "properties": { - "_type": { - "type": "string", - "const": "http", - "title": "Type", - "description": "The type of the object", - "default": "http" - }, - "url": { - "type": "string", - "minLength": 1, - "format": "uri", - "title": "Url" - } + { + "type": "null" + } + ], + "title": "Score" + }, + "vector_string": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "url" - ], - "title": "HTTPSBOMInfoInput", - "description": "A URL pointing to a Software Bill of Materials file." 
- }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": "Detail" - } + { + "type": "null" + } + ], + "title": "Vector String" + } + }, + "type": "object", + "title": "CVSS" + }, + "CVSS3": { + "properties": { + "cvss3_base_score": { + "anyOf": [ + { + "type": "number" }, - "type": "object", - "title": "HTTPValidationError" + { + "type": "null" + } + ], + "title": "Cvss3 Base Score" }, - "ImageContent": { - "properties": { - "type": { - "type": "string", - "const": "image_url", - "title": "Type", - "default": "image_url" - }, - "image_url": { - "$ref": "#/components/schemas/ImageUrl", - "default": { - "url": "http://default.com/" - } - } + "cvss3_scoring_vector": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": false, - "type": "object", - "title": "ImageContent" - }, - "ImageInfoInput-Input": { - "properties": { - "name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Name" - }, - "tag": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Tag" - }, - "digest": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Digest" - }, - "platform": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Platform" - }, - "feed_group": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Feed Group" - }, - "analysis_type": { - "type": "string", - "enum": [ - "image", - "source" - ], - "title": "Analysis Type" - }, - "ecosystem": { - "type": "string", - "enum": [ - "go", - "python", - "javascript", - "java", - "c", - ], - "title": "Ecosystem" - }, - "manifest_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Manifest Path" - }, - "source_info": { - "items": { - "$ref": "#/components/schemas/SourceDocumentsInfo" - }, - "type": "array", - "title": "Source 
Info" - }, - "sbom_info": { - "oneOf": [ - { - "$ref": "#/components/schemas/ManualSBOMInfoInput" - }, - { - "$ref": "#/components/schemas/FileSBOMInfoInput" - }, - { - "$ref": "#/components/schemas/HTTPSBOMInfoInput" - } - ], - "title": "Sbom Info" - } + { + "type": "null" + } + ], + "title": "Cvss3 Scoring Vector" + }, + "status": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "analysis_type", - "source_info", - "sbom_info" - ], - "title": "ImageInfoInput", - "description": "Information about a container image, including the source information and sbom information." - }, - "ImageInfoInput-Output": { - "properties": { - "name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Name" - }, - "tag": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Tag" - }, - "digest": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Digest" - }, - "platform": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Platform" - }, - "feed_group": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Feed Group" - }, - "analysis_type": { - "type": "string", - "enum": [ - "image", - "source" - ], - "title": "Analysis Type" - }, - "ecosystem": { - "type": "string", - "enum": [ - "go", - "python", - "javascript", - "java", - "c", - ], - "title": "Ecosystem" - }, - "manifest_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Manifest Path" - }, - "source_info": { - "items": { - "$ref": "#/components/schemas/SourceDocumentsInfo" - }, - "type": "array", - "title": "Source Info" - }, - "sbom_info": { - "oneOf": [ - { - "$ref": "#/components/schemas/ManualSBOMInfoInput" - }, - { - "$ref": "#/components/schemas/FileSBOMInfoInput" - }, - { - "$ref": "#/components/schemas/HTTPSBOMInfoInput" - } - ], - "title": "Sbom Info" - } + { + "type": "null" + } + ], + 
"title": "Status" + } + }, + "type": "object", + "title": "CVSS3" + }, + "CVSSOutput": { + "properties": { + "vector_string": { + "type": "string", + "title": "Vector String" + }, + "score": { + "type": "string", + "title": "Score" + } + }, + "type": "object", + "required": [ + "vector_string", + "score" + ], + "title": "CVSSOutput", + "description": "CVSS (Common Vulnerability Scoring System) representing the severity of a vulnerability in reference to an image.\n- vector_string: The CVSS vector string that encodes the metric values used to calculate the score.\n- score: The calculated CVSS base score representing the severity of the vulnerability in the given image." + }, + "CVSSV3": { + "properties": { + "attackComplexity": { + "type": "string", + "title": "Attackcomplexity" + }, + "attackVector": { + "type": "string", + "title": "Attackvector" + }, + "availabilityImpact": { + "type": "string", + "title": "Availabilityimpact" + }, + "baseScore": { + "type": "number", + "title": "Basescore" + }, + "baseSeverity": { + "type": "string", + "title": "Baseseverity" + }, + "confidentialityImpact": { + "type": "string", + "title": "Confidentialityimpact" + }, + "integrityImpact": { + "type": "string", + "title": "Integrityimpact" + }, + "privilegesRequired": { + "type": "string", + "title": "Privilegesrequired" + }, + "scope": { + "type": "string", + "title": "Scope" + }, + "userInteraction": { + "type": "string", + "title": "Userinteraction" + }, + "vectorString": { + "type": "string", + "title": "Vectorstring" + }, + "version": { + "type": "string", + "title": "Version" + } + }, + "type": "object", + "required": [ + "attackComplexity", + "attackVector", + "availabilityImpact", + "baseScore", + "baseSeverity", + "confidentialityImpact", + "integrityImpact", + "privilegesRequired", + "scope", + "userInteraction", + "vectorString", + "version" + ], + "title": "CVSSV3" + }, + "CWE": { + "properties": { + "cwe_id": { + "type": "string", + "title": "Cwe Id" + }, + "name": { 
+ "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "analysis_type", - "source_info", - "sbom_info" - ], - "title": "ImageInfoInput", - "description": "Information about a container image, including the source information and sbom information." - }, - "ImageUrl": { - "properties": { - "url": { - "type": "string", - "maxLength": 2083, - "minLength": 1, - "format": "uri", - "title": "Url", - "default": "http://default.com/" - } + { + "type": "null" + } + ], + "title": "Name" + } + }, + "type": "object", + "required": [ + "cwe_id" + ], + "title": "CWE" + }, + "ChatRequest": { + "properties": { + "messages": { + "items": { + "$ref": "#/components/schemas/Message" + }, + "type": "array", + "title": "Messages" + }, + "model": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "title": "ImageUrl" + { + "type": "null" + } + ], + "title": "Model", + "description": "name of the model to use" }, - "Impact": { - "properties": { - "baseMetricV3": { - "$ref": "#/components/schemas/BaseMetricV3" - } + "frequency_penalty": { + "anyOf": [ + { + "type": "number" }, - "type": "object", - "required": [ - "baseMetricV3" - ], - "title": "Impact" - }, - "InputAudio": { - "properties": { - "data": { - "type": "string", - "title": "Data", - "default": "default" - }, - "format": { - "type": "string", - "title": "Format", - "default": "default" - } + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "description": "Penalty for new tokens based on frequency in text", + "default": 0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" }, - "type": "object", - "title": "InputAudio" + { + "type": "null" + } + ], + "title": "Logit Bias", + "description": "Modify likelihood of specified tokens appearing" }, - "JustificationOutput": { - "properties": { - "label": { - "type": "string", - "title": "Label" - }, - "reason": { - "type": "string", - "title": "Reason" - }, - "status": { - "type": 
"string", - "enum": [ - "TRUE", - "FALSE", - "UNKNOWN" - ], - "title": "Status" - } + "logprobs": { + "anyOf": [ + { + "type": "boolean" }, - "type": "object", - "required": [ - "label", - "reason", - "status" - ], - "title": "JustificationOutput", - "description": "Final justification for the vulnerability.\n\n- label: a categorical justification label classifying the status of an image against a given vulnerability, e.g.\n code_not_present, code_not_reachable, false_positive.\n- reason: a human-readable explanation for why justification label was selected.\n- status: a ternary status (TRUE, FALSE, OR UNKNOWN) that indicates whether the image can be exploited for a given\n vulnerability. Determined based on a mapping from the justification label." - }, - "ManualSBOMInfoInput": { - "properties": { - "_type": { - "type": "string", - "const": "manual", - "title": "Type", - "description": "The type of the object", - "default": "manual" - }, - "packages": { - "items": { - "$ref": "#/components/schemas/SBOMPackage" - }, - "type": "array", - "title": "Packages" - } + { + "type": "null" + } + ], + "title": "Logprobs", + "description": "Whether to return log probabilities" + }, + "top_logprobs": { + "anyOf": [ + { + "type": "integer" }, - "type": "object", - "required": [ - "packages" - ], - "title": "ManualSBOMInfoInput", - "description": "Manually provided Software Bill of Materials, consisting of a list of SBOMPackage objects." 
- }, - "Message": { - "properties": { - "content": { - "anyOf": [ - { - "type": "string" - }, - { - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/TextContent" - }, - { - "$ref": "#/components/schemas/ImageContent" - }, - { - "$ref": "#/components/schemas/AudioContent" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "image_url": "#/components/schemas/ImageContent", - "input_audio": "#/components/schemas/AudioContent", - "text": "#/components/schemas/TextContent" - } - } - }, - "type": "array" - } - ], - "title": "Content" + { + "type": "null" + } + ], + "title": "Top Logprobs", + "description": "Number of most likely tokens to return" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens", + "description": "Maximum number of tokens to generate" + }, + "n": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "N", + "description": "Number of chat completion choices to generate", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "description": "Penalty for new tokens based on presence in text", + "default": 0 + }, + "response_format": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Response Format", + "description": "Response format specification" + }, + "seed": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Seed", + "description": "Random seed for deterministic sampling" + }, + "service_tier": { + "anyOf": [ + { + "type": "string", + "enum": [ + "auto", + "default" + ] + }, + { + "type": "null" + } + ], + "title": "Service Tier", + "description": "Service tier for the request" + }, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "description": "Whether to stream partial message deltas", + "default": 
false + }, + "stream_options": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Stream Options", + "description": "Options for streaming" + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "description": "Sampling temperature between 0 and 2", + "default": 1 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "description": "Nucleus sampling parameter" + }, + "tools": { + "anyOf": [ + { + "items": { + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tools", + "description": "List of tools the model may call" + }, + "tool_choice": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Tool Choice", + "description": "Controls which tool is called" + }, + "parallel_tool_calls": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Parallel Tool Calls", + "description": "Whether to enable parallel function calling", + "default": true + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User", + "description": "Unique identifier representing end-user" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "messages" + ], + "title": "ChatRequest", + "description": "ChatRequest is a data model that represents a request to the NAT chat API.\nFully compatible with OpenAI Chat Completions API specification.", + "example": { + "messages": [ + { + "content": "who are you?", + "role": "user" + } + ], + "model": "nvidia/nemotron", + "stream": false, + "temperature": 0.7 + } + }, + "ChatResponse": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "object": { + "type": "string", + "title": "Object", + "default": "chat.completion" + }, + "model": { + "type": "string", + "title": "Model", + "default": 
"" + }, + "created": { + "type": "integer", + "title": "Created" + }, + "choices": { + "items": { + "$ref": "#/components/schemas/Choice" + }, + "type": "array", + "title": "Choices" + }, + "usage": { + "anyOf": [ + { + "$ref": "#/components/schemas/Usage" + }, + { + "type": "null" + } + ] + }, + "system_fingerprint": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System Fingerprint" + }, + "service_tier": { + "anyOf": [ + { + "type": "string", + "enum": [ + "scale", + "default" + ] + }, + { + "type": "null" + } + ], + "title": "Service Tier" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "id", + "created", + "choices" + ], + "title": "ChatResponse", + "description": "ChatResponse is a data model that represents a response from the NAT chat API.\nFully compatible with OpenAI Chat Completions API specification." + }, + "ChatResponseChunk": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "choices": { + "items": { + "$ref": "#/components/schemas/Choice" + }, + "type": "array", + "title": "Choices" + }, + "created": { + "type": "integer", + "title": "Created" + }, + "model": { + "type": "string", + "title": "Model", + "default": "" + }, + "object": { + "type": "string", + "title": "Object", + "default": "chat.completion.chunk" + }, + "system_fingerprint": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System Fingerprint" + }, + "service_tier": { + "anyOf": [ + { + "type": "string", + "enum": [ + "scale", + "default" + ] + }, + { + "type": "null" + } + ], + "title": "Service Tier" + }, + "usage": { + "anyOf": [ + { + "$ref": "#/components/schemas/Usage" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "id", + "choices", + "created" + ], + "title": "ChatResponseChunk", + "description": "ChatResponseChunk is a data model that represents a response chunk from the NAT chat 
streaming API.\nFully compatible with OpenAI Chat Completions API specification." + }, + "ChecklistItemOutput": { + "properties": { + "input": { + "type": "string", + "title": "Input" + }, + "response": { + "type": "string", + "title": "Response" + }, + "intermediate_steps": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/AgentIntermediateStep" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Intermediate Steps" + } + }, + "type": "object", + "required": [ + "input", + "response" + ], + "title": "ChecklistItemOutput", + "description": "Input, response, and intermediate steps for a single checklist item provided to the LLM agent." + }, + "Choice": { + "properties": { + "message": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChoiceMessage" + }, + { + "type": "null" + } + ] + }, + "delta": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChoiceDelta" + }, + { + "type": "null" + } + ] + }, + "finish_reason": { + "anyOf": [ + { + "type": "string", + "enum": [ + "stop", + "length", + "tool_calls", + "content_filter", + "function_call" + ] + }, + { + "type": "null" + } + ], + "title": "Finish Reason" + }, + "index": { + "type": "integer", + "title": "Index" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "index" + ], + "title": "Choice" + }, + "ChoiceDelta": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "role": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Role" + } + }, + "type": "object", + "title": "ChoiceDelta", + "description": "Delta object for streaming responses (OpenAI-compatible)" + }, + "ChoiceMessage": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "role": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Role" + } + }, + "type": "object", + 
"title": "ChoiceMessage" + }, + "Configuration": { + "properties": { + "package": { + "type": "string", + "title": "Package" + }, + "vendor": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Vendor" + }, + "system": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System" + }, + "versionStartExcluding": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Versionstartexcluding" + }, + "versionEndExcluding": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Versionendexcluding" + }, + "versionStartIncluding": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Versionstartincluding" + }, + "versionEndIncluding": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Versionendincluding" + } + }, + "type": "object", + "required": [ + "package" + ], + "title": "Configuration" + }, + "CveIntel": { + "properties": { + "vuln_id": { + "type": "string", + "title": "Vuln Id" + }, + "ghsa": { + "anyOf": [ + { + "$ref": "#/components/schemas/CveIntelGhsa" + }, + { + "type": "null" + } + ] + }, + "nvd": { + "anyOf": [ + { + "$ref": "#/components/schemas/CveIntelNvd" + }, + { + "type": "null" + } + ] + }, + "rhsa": { + "anyOf": [ + { + "$ref": "#/components/schemas/CveIntelRhsa" + }, + { + "type": "null" + } + ] + }, + "ubuntu": { + "anyOf": [ + { + "$ref": "#/components/schemas/CveIntelUbuntu" + }, + { + "type": "null" + } + ] + }, + "epss": { + "anyOf": [ + { + "$ref": "#/components/schemas/CveIntelEpss" + }, + { + "type": "null" + } + ] + }, + "plugin_data": { + "items": { + "$ref": "#/components/schemas/IntelPluginData" + }, + "type": "array", + "title": "Plugin Data", + "default": [] + }, + "intel_score": { + "type": "integer", + "title": "Intel Score", + "default": 0 + }, + "has_sufficient_intel_for_agent": { + "type": "boolean", + "title": "Has Sufficient Intel For Agent", 
+ "description": "Logic to determine if the CVE has sufficient intel and can be passed to the agent.\n\nReturns\n-------\nbool\n True if enough intel has been found for the CVE", + "readOnly": true + } + }, + "type": "object", + "required": [ + "vuln_id", + "has_sufficient_intel_for_agent" + ], + "title": "CveIntel", + "description": "Information about a CVE (Common Vulnerabilities and Exposures) entry." + }, + "CveIntelEpss": { + "properties": { + "epss": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Epss" + }, + "percentile": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Percentile" + }, + "date": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Date" + } + }, + "additionalProperties": true, + "type": "object", + "title": "CveIntelEpss", + "description": "Information about an EPSS (Elastic Product Security Service) entry." + }, + "CveIntelGhsa": { + "properties": { + "ghsa_id": { + "type": "string", + "title": "Ghsa Id" + }, + "cve_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cve Id" + }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Summary" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "severity": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Severity" + }, + "vulnerabilities": { + "anyOf": [ + { + "items": { + + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Vulnerabilities" + }, + "cvss": { + "anyOf": [ + { + "$ref": "#/components/schemas/CVSS" + }, + { + "type": "null" + } + ] + }, + "cwes": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/CWE" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Cwes" + }, + "published_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": 
"null" + } + ], + "title": "Published At" + }, + "updated_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Updated At" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "ghsa_id" + ], + "title": "CveIntelGhsa", + "description": "Information about a GHSA (GitHub Security Advisory) entry." + }, + "CveIntelNvd": { + "properties": { + "cve_id": { + "type": "string", + "title": "Cve Id" + }, + "cve_description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cve Description" + }, + "cvss_vector": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cvss Vector" + }, + "cvss_base_score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Cvss Base Score" + }, + "cvss_severity": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cvss Severity" + }, + "cwe_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cwe Id" + }, + "cwe_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cwe Name" + }, + "cwe_description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cwe Description" + }, + "cwe_extended_description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cwe Extended Description" + }, + "configurations": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/Configuration" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Configurations" + }, + "vendor_names": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Vendor Names" + }, + "references": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "References" + }, + "disputed": { + "anyOf": [ + { + 
"type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Disputed" + }, + "published_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Published At" + }, + "updated_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Updated At" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "cve_id" + ], + "title": "CveIntelNvd", + "description": "Information about an NVD (National Vulnerability Database) entry." + }, + "CveIntelRhsa": { + "properties": { + "bugzilla": { + "$ref": "#/components/schemas/Bugzilla" + }, + "details": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Details" + }, + "statement": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Statement" + }, + "package_state": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/PackageState" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Package State" + }, + "upstream_fix": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Upstream Fix" + }, + "cvss3": { + "anyOf": [ + { + "$ref": "#/components/schemas/CVSS3" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": true, + "type": "object", + "title": "CveIntelRhsa", + "description": "Information about a RHSA (Red Hat Security Advisory) entry." 
+ }, + "CveIntelUbuntu": { + "properties": { + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "notes": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/Note" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Notes" + }, + "notices": { + "anyOf": [ + { + "items": { + + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Notices" + }, + "priority": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Priority" + }, + "ubuntu_description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Ubuntu Description" + }, + "impact": { + "anyOf": [ + { + "$ref": "#/components/schemas/Impact" + }, + { + "type": "null" + } + ] + }, + "patches": { + "anyOf": [ + { + "additionalProperties": { + "items": { + "type": "string" + }, + "type": "array" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Patches", + "description": "Map of package name to patch refs (e.g., 'upstream: https://github.com/.../commit/...')" + } + }, + "additionalProperties": true, + "type": "object", + "title": "CveIntelUbuntu", + "description": "Information about a Ubuntu CVE entry." + }, + "DependencyPackage": { + "properties": { + "system": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System" + }, + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name" + }, + "version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Version" + }, + "relation": { + "anyOf": [ + { + "type": "string", + "enum": [ + "SELF", + "DIRECT", + "INDIRECT" + ] + }, + { + "type": "null" + } + ], + "title": "Relation" + } + }, + "type": "object", + "title": "DependencyPackage", + "description": "Information about a dependency package as obtained from deps.dev API for a given SBOM package." 
+ }, + "Ecosystem": { + "type": "string", + "enum": [ + "go", + "python", + "javascript", + "java", + "c" + ], + "title": "Ecosystem" + }, + "EnumIdentifyResult": { + "type": "string", + "enum": [ + "yes", + "no", + "unknown" + ], + "title": "EnumIdentifyResult", + "description": "Result of the PackageIdentify phase for a single CVE." + }, + "EvaluateRequest": { + "properties": { + "config_file": { + "type": "string", + "title": "Config File", + "description": "Path to the configuration file for evaluation" + }, + "job_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Job Id", + "description": "Unique identifier for the evaluation job" + }, + "reps": { + "type": "integer", + "exclusiveMinimum": 0, + "title": "Reps", + "description": "Number of repetitions for the evaluation, defaults to 1", + "default": 1 + }, + "expiry_seconds": { + "type": "integer", + "exclusiveMinimum": 0, + "title": "Expiry Seconds", + "description": "Optional time (in seconds) before the job expires. Clamped between 600 (10 min) and 86400 (24h).", + "default": 3600 + } + }, + "type": "object", + "required": [ + "config_file" + ], + "title": "EvaluateRequest", + "description": "Request model for the evaluate endpoint." + }, + "EvaluateResponse": { + "properties": { + "job_id": { + "type": "string", + "title": "Job Id", + "description": "Unique identifier for the job" + }, + "status": { + "type": "string", + "title": "Status", + "description": "Current status of the job" + } + }, + "type": "object", + "required": [ + "job_id", + "status" + ], + "title": "EvaluateResponse", + "description": "Response model for the evaluate endpoint." 
+ }, + "EvaluateStatusResponse": { + "properties": { + "job_id": { + "type": "string", + "title": "Job Id", + "description": "Unique identifier for the evaluation job" + }, + "status": { + "type": "string", + "title": "Status", + "description": "Current status of the evaluation job" + }, + "error": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Error", + "description": "Error message if the job failed" + }, + "created_at": { + "type": "string", + "format": "date-time", + "title": "Created At", + "description": "Timestamp when the job was created" + }, + "updated_at": { + "type": "string", + "format": "date-time", + "title": "Updated At", + "description": "Timestamp when the job was last updated" + }, + "expires_at": { + "anyOf": [ + { + "type": "string", + "format": "date-time" + }, + { + "type": "null" + } + ], + "title": "Expires At", + "description": "Timestamp when the job will expire" + }, + "config_file": { + "type": "string", + "title": "Config File", + "description": "Path to the configuration file used for evaluation" + }, + "output_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Output Path", + "description": "Path to the output file if the job completed successfully" + } + }, + "type": "object", + "required": [ + "job_id", + "status", + "created_at", + "updated_at", + "config_file" + ], + "title": "EvaluateStatusResponse", + "description": "Response model for the evaluate status endpoint." + }, + "FileSBOMInfoInput": { + "properties": { + "_type": { + "type": "string", + "const": "file", + "title": "Type", + "description": "The type of the object", + "default": "file" + }, + "file_path": { + "type": "string", + "title": "File Path" + } + }, + "type": "object", + "required": [ + "file_path" + ], + "title": "FileSBOMInfoInput", + "description": "A file path pointing to a Software Bill of Materials file." 
+ }, + "HTTPSBOMInfoInput": { + "properties": { + "_type": { + "type": "string", + "const": "http", + "title": "Type", + "description": "The type of the object", + "default": "http" + }, + "url": { + "type": "string", + "minLength": 1, + "format": "uri", + "title": "Url" + } + }, + "type": "object", + "required": [ + "url" + ], + "title": "HTTPSBOMInfoInput", + "description": "A URL pointing to a Software Bill of Materials file." + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HealthStatusResponse": { + "properties": { + "status": { + "type": "string", + "title": "Status" + } + }, + "type": "object", + "required": [ + "status" + ], + "title": "HealthStatusResponse" + }, + "ImageContent": { + "properties": { + "type": { + "type": "string", + "const": "image_url", + "title": "Type", + "default": "image_url" + }, + "image_url": { + "$ref": "#/components/schemas/ImageUrl", + "default": { + "url": "http://default.com/" + } + } + }, + "additionalProperties": false, + "type": "object", + "title": "ImageContent" + }, + "ImageInfoInput-Input": { + "properties": { + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name" + }, + "tag": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Tag" + }, + "digest": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Digest" + }, + "platform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Platform" + }, + "feed_group": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Feed Group" + }, + "ecosystem": { + "anyOf": [ + { + "$ref": "#/components/schemas/Ecosystem" + }, + { + "type": "null" + } + ] + }, + "manifest_path": { + "anyOf": [ + { + "type": "string" + }, + { + 
"type": "null" + } + ], + "title": "Manifest Path" + }, + "analysis_type": { + "$ref": "#/components/schemas/AnalysisType" + }, + "pipeline_mode": { + "$ref": "#/components/schemas/PipelineMode", + "default": "full_pipeline" + }, + "target_package": { + "anyOf": [ + { + "$ref": "#/components/schemas/TargetPackage" + }, + { + "type": "null" + } + ] + }, + "source_info": { + "items": { + "$ref": "#/components/schemas/SourceDocumentsInfo" + }, + "type": "array", + "title": "Source Info", + "default": [] + }, + "sbom_info": { + "anyOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/ManualSBOMInfoInput" + }, + { + "$ref": "#/components/schemas/FileSBOMInfoInput" + }, + { + "$ref": "#/components/schemas/HTTPSBOMInfoInput" + } + ] + }, + { + "type": "null" + } + ], + "title": "Sbom Info" + } + }, + "type": "object", + "required": [ + "analysis_type" + ], + "title": "ImageInfoInput", + "description": "Information about a container image, including the source information and sbom information." 
+ }, + "ImageInfoInput-Output": { + "properties": { + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name" + }, + "tag": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Tag" + }, + "digest": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Digest" + }, + "platform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Platform" + }, + "feed_group": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Feed Group" + }, + "ecosystem": { + "anyOf": [ + { + "$ref": "#/components/schemas/Ecosystem" + }, + { + "type": "null" + } + ] + }, + "manifest_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Manifest Path" + }, + "analysis_type": { + "$ref": "#/components/schemas/AnalysisType" + }, + "pipeline_mode": { + "$ref": "#/components/schemas/PipelineMode", + "default": "full_pipeline" + }, + "target_package": { + "anyOf": [ + { + "$ref": "#/components/schemas/TargetPackage" + }, + { + "type": "null" + } + ] + }, + "source_info": { + "items": { + "$ref": "#/components/schemas/SourceDocumentsInfo" + }, + "type": "array", + "title": "Source Info", + "default": [] + }, + "sbom_info": { + "anyOf": [ + { + "oneOf": [ + { + "$ref": "#/components/schemas/ManualSBOMInfoInput" }, - "role": { - "type": "string", - "title": "Role" + { + "$ref": "#/components/schemas/FileSBOMInfoInput" + }, + { + "$ref": "#/components/schemas/HTTPSBOMInfoInput" } + ] + }, + { + "type": "null" + } + ], + "title": "Sbom Info" + } + }, + "type": "object", + "required": [ + "analysis_type" + ], + "title": "ImageInfoInput", + "description": "Information about a container image, including the source information and sbom information." 
+ }, + "ImageUrl": { + "properties": { + "url": { + "type": "string", + "maxLength": 2083, + "minLength": 1, + "format": "uri", + "title": "Url", + "default": "http://default.com/" + } + }, + "type": "object", + "title": "ImageUrl" + }, + "Impact": { + "properties": { + "baseMetricV3": { + "$ref": "#/components/schemas/BaseMetricV3" + } + }, + "type": "object", + "required": [ + "baseMetricV3" + ], + "title": "Impact" + }, + "InputAudio": { + "properties": { + "data": { + "type": "string", + "title": "Data", + "default": "default" + }, + "format": { + "type": "string", + "title": "Format", + "default": "default" + } + }, + "type": "object", + "title": "InputAudio" + }, + "IntelPluginData": { + "properties": { + "label": { + "type": "string", + "title": "Label" + }, + "description": { + "type": "string", + "title": "Description" + } + }, + "type": "object", + "required": [ + "label", + "description" + ], + "title": "IntelPluginData" + }, + "JustificationOutput": { + "properties": { + "label": { + "type": "string", + "title": "Label" + }, + "reason": { + "type": "string", + "title": "Reason" + }, + "status": { + "type": "string", + "enum": [ + "TRUE", + "FALSE", + "UNKNOWN" + ], + "title": "Status" + } + }, + "type": "object", + "required": [ + "label", + "reason", + "status" + ], + "title": "JustificationOutput", + "description": "Final justification for the vulnerability.\n\n- label: a categorical justification label classifying the status of an image against a given vulnerability, e.g.\n code_not_present, code_not_reachable, false_positive.\n- reason: a human-readable explanation for why justification label was selected.\n- status: a ternary status (TRUE, FALSE, OR UNKNOWN) that indicates whether the image can be exploited for a given\n vulnerability. Determined based on a mapping from the justification label." 
+ }, + "L1InvestigationResult": { + "properties": { + "downstream_report": { + "anyOf": [ + { + "type": "object" }, - "type": "object", - "required": [ - "content", - "role" - ], - "title": "Message" - }, - "Note": { - "properties": { - "author": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Author" - }, - "note": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Note" - } + { + "type": "null" + } + ], + "title": "Downstream Report", + "description": "Serialized DownstreamSearchReport from L1 investigation" + }, + "upstream_report": { + "anyOf": [ + { + "type": "object" }, - "type": "object", - "title": "Note" - }, - "OutputPayload": { - "properties": { - "analysis": { - "items": { - "$ref": "#/components/schemas/AgentMorpheusEngineOutput" - }, - "type": "array", - "title": "Analysis" - }, - "vex": { - "anyOf": [ - { - "type": "object" - }, - { - "type": "null" - } - ], - "title": "Vex" - } + { + "type": "null" + } + ], + "title": "Upstream Report", + "description": "Serialized UpstreamSearchReport from L1 investigation" + }, + "l1_agent_answer": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "analysis", - "vex" - ], - "title": "OutputPayload", - "description": "Wrapper for final pipeline results.\n- analysis: per-vulnerability analysis results\n- vex: the vulnerability exploitability exchange document JSON" - }, - "PackageState": { - "properties": { - "product_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Product Name" - }, - "fix_state": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Fix State" - }, - "package_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Package Name" - }, - "cpe": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Cpe" - } + { + "type": "null" + } + ], + "title": "L1 Agent 
Answer", + "description": "Final answer from the L1 ReAct agent" + }, + "vulnerability_intel": { + "anyOf": [ + { + "$ref": "#/components/schemas/VulnerabilityIntel" }, - "type": "object", - "title": "PackageState" - }, - "SBOMInfo": { - "properties": { - "packages": { - "items": { - "$ref": "#/components/schemas/SBOMPackage" - }, - "type": "array", - "title": "Packages" - } + { + "type": "null" + } + ], + "description": "Structured vulnerability intelligence extracted from CVE advisories and patches" + }, + "preliminary_verdict": { + "type": "string", + "enum": [ + "vulnerable", + "protected", + "not_present", + "uncertain" + ], + "title": "Preliminary Verdict", + "description": "L1 verdict before L2 refinement", + "default": "uncertain" + }, + "confidence": { + "type": "number", + "maximum": 1, + "minimum": 0, + "title": "Confidence", + "description": "Confidence in the preliminary verdict", + "default": 0 + } + }, + "type": "object", + "title": "L1InvestigationResult", + "description": "Intermediate result from L1 investigation, input to L2 or report generation." + }, + "L2BuildResult": { + "properties": { + "compilation_status": { + "type": "string", + "enum": [ + "compiled", + "not_compiled", + "unknown" + ], + "title": "Compilation Status", + "description": "Whether vulnerable code is compiled into the binary", + "default": "unknown" + }, + "compilation_confidence": { + "type": "number", + "maximum": 1, + "minimum": 0, + "title": "Compilation Confidence", + "description": "Confidence in compilation status", + "default": 0 + }, + "compilation_evidence": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "packages" - ], - "title": "SBOMInfo", - "description": "List of SBOMPackage objects representing the packages found in the input image." 
- }, - "SBOMPackage": { - "properties": { - "name": { - "type": "string", - "title": "Name" - }, - "version": { - "type": "string", - "title": "Version" - }, - "path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Path" - }, - "system": { - "type": "string", - "title": "System" - } + { + "type": "null" + } + ], + "title": "Compilation Evidence", + "description": "Evidence supporting compilation status" + }, + "hardening_relevant": { + "anyOf": [ + { + "type": "boolean" }, - "type": "object", - "required": [ - "name", - "version", - "system" - ], - "title": "SBOMPackage", - "description": "Information about a single package in the container image's Software Bill of Materials (SBOM)." - }, - "ScanInfoInput": { - "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "type": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Type" - }, - "started_at": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Started At" - }, - "completed_at": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Completed At" - }, - "vulns": { - "items": { - "$ref": "#/components/schemas/VulnInfo" - }, - "type": "array", - "minItems": 1, - "title": "Vulns" - } + { + "type": "null" + } + ], + "title": "Hardening Relevant", + "description": "Whether detected hardening flags are relevant to the CVE" + }, + "hardening_flags": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Hardening Flags", + "description": "Hardening flags detected in build log or binary" + }, + "hardening_rationale": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "vulns" - ], - "title": "ScanInfoInput", - "description": "Information about a unique scan for a container image against a list of vulnerabilies." 
- }, - "SourceDocumentsInfo": { - "properties": { - "type": { - "type": "string", - "enum": [ - "code", - "doc" - ], - "title": "Type" - }, - "git_repo": { - "type": "string", - "minLength": 1, - "title": "Git Repo" - }, - "ref": { - "type": "string", - "minLength": 1, - "title": "Ref" - }, - "include": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Include", - "default": [ - "*.py", - "*.ipynb" - ] - }, - "exclude": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Exclude", - "default": [] - } + { + "type": "null" + } + ], + "title": "Hardening Rationale", + "description": "Rationale for hardening relevance judgment" + }, + "l2_override_verdict": { + "enum": [ + "not_vulnerable", + "vulnerable_mitigated", + null], + "title": "L2 Override Verdict", + "description": "L2 verdict override (if any)" + } + }, + "type": "object", + "title": "L2BuildResult", + "description": "Result from L2 Build Agent (BuildCompilationCheck + HardeningCheck)." + }, + "ManualSBOMInfoInput": { + "properties": { + "_type": { + "type": "string", + "const": "manual", + "title": "Type", + "description": "The type of the object", + "default": "manual" + }, + "packages": { + "items": { + "$ref": "#/components/schemas/SBOMPackage" + }, + "type": "array", + "title": "Packages" + } + }, + "type": "object", + "required": [ + "packages" + ], + "title": "ManualSBOMInfoInput", + "description": "Manually provided Software Bill of Materials, consisting of a list of SBOMPackage objects." 
+ }, + "Message": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" }, - "type": "object", - "required": [ - "type", - "git_repo", - "ref" - ], - "title": "SourceDocumentsInfo", - "description": "Information about the source documents for the container image.\n\n- type: document type.\n- git_repo: git repo URL where the source documents can be cloned.\n- ref: git reference, such as tag/branch/commit_id\n- include: file extensions to include when indexing the source documents.\n- exclude: file extensions to exclude when indexing the source documents." - }, - "TextContent": { - "properties": { - "type": { - "type": "string", - "const": "text", - "title": "Type", - "default": "text" - }, - "text": { - "type": "string", - "title": "Text", - "default": "default" + { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/TextContent" + }, + { + "$ref": "#/components/schemas/ImageContent" + }, + { + "$ref": "#/components/schemas/AudioContent" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "image_url": "#/components/schemas/ImageContent", + "input_audio": "#/components/schemas/AudioContent", + "text": "#/components/schemas/TextContent" + } } + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "title": "Role" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "Message" + }, + "Note": { + "properties": { + "author": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": false, - "type": "object", - "title": "TextContent" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - }, - "type": "array", - "title": "Location" - }, - "msg": { - "type": "string", - "title": "Message" - }, - "type": { - "type": "string", - "title": "Error Type" - } + { + "type": "null" + } + ], + "title": "Author" + }, + "note": { + "anyOf": [ + { + "type": "string" }, - "type": 
"object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError" - }, - "VdbPaths": { - "properties": { - "code_vdb_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Code Vdb Path" - }, - "doc_vdb_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Doc Vdb Path" - }, - "code_index_path": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Code Index Path" - } + { + "type": "null" + } + ], + "title": "Note" + } + }, + "type": "object", + "title": "Note" + }, + "OutputPayload": { + "properties": { + "analysis": { + "items": { + "$ref": "#/components/schemas/AgentMorpheusEngineOutput" + }, + "type": "array", + "title": "Analysis" + }, + "vex": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Vex" + } + }, + "type": "object", + "required": [ + "analysis", + "vex" + ], + "title": "OutputPayload", + "description": "Wrapper for final pipeline results.\n- analysis: per-vulnerability analysis results\n- vex: the vulnerability exploitability exchange document JSON" + }, + "PackageCheckerContext": { + "properties": { + "status": { + "anyOf": [ + { + "$ref": "#/components/schemas/PackageCheckerStatus" }, - "type": "object", - "title": "VdbPaths", - "description": "Paths to where the generated VDBs are stored." 
+ { + "type": "null" + } + ] }, - "VulnInfo": { - "properties": { - "vuln_id": { - "type": "string", - "title": "Vuln Id" - }, - "description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Description" - }, - "score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Score" - }, - "severity": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Severity" - }, - "published_date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Published Date" - }, - "last_modified_date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Last Modified Date" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Url" - }, - "feed_group": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Feed Group" - }, - "package": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Package" - }, - "package_version": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Package Version" - }, - "package_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Package Name" - }, - "package_type": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Package Type" - } + "source_key": { + "anyOf": [ + { + "type": "string" }, - "additionalProperties": true, - "type": "object", - "required": [ - "vuln_id" - ], - "title": "VulnInfo", - "description": "Information about a vulnerability." 
- }, - "VulnerableDependencies": { - "properties": { - "vuln_id": { - "type": "string", - "title": "Vuln Id" - }, - "vuln_package_intel_sources": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Vuln Package Intel Sources" - }, - "vulnerable_sbom_packages": { - "items": { - "$ref": "#/components/schemas/VulnerableSBOMPackage" - }, - "type": "array", - "title": "Vulnerable Sbom Packages" - } + { + "type": "null" + } + ], + "title": "Source Key" + }, + "artifacts": { + "$ref": "#/components/schemas/AcquiredArtifacts" + }, + "identify_result": { + "$ref": "#/components/schemas/PackageIdentifyResult" + }, + "l1_result": { + "anyOf": [ + { + "$ref": "#/components/schemas/L1InvestigationResult" }, - "type": "object", - "required": [ - "vuln_id", - "vuln_package_intel_sources", - "vulnerable_sbom_packages" - ], - "title": "VulnerableDependencies", - "description": "Information about the vulnerable SBOM packages associated with the vuln_id.\n\n- vuln_id: vulnerability ID (e.g. CVE ID, GHSA ID) associated with the vulnerable package list.\n- vuln_package_intel_sources: list of sources (e.g. \"ghsa\", \"nvd\", \"ubuntu\", \"rhsa\") that provided\n the vulnerable package/version intel for the vuln_id.\n- vulnerable_sbom_packages: list of VulnerableSBOMPackage objects, representing the SBOM packages that are\n vulnerable for a given vuln_id." 
- }, - "VulnerableSBOMPackage": { - "properties": { - "name": { - "type": "string", - "title": "Name" - }, - "version": { - "type": "string", - "title": "Version" - }, - "vulnerable_dependency_package": { - "$ref": "#/components/schemas/DependencyPackage" - } + { + "type": "null" + } + ], + "description": "Result from L1 Code Agent investigation" + }, + "l2_result": { + "anyOf": [ + { + "$ref": "#/components/schemas/L2BuildResult" }, - "type": "object", - "required": [ - "name", - "version", - "vulnerable_dependency_package" - ], - "title": "VulnerableSBOMPackage", - "description": "Information about a vulnerable SBOM package and its related vulnerable dependency package.\n\n- name: SBOM package name\n- version: SBOM package version\n- vulnerable_dependency_package: DependencyPackage object with info about the vulnerable dependency package.\n If an SBOM package itself is vulnerable, the vulnerable_dependency_package.relation will be \"SELF\".\n Otherwise, if it is vulnerable due to its dependency, the vulnerable_dependency_package.relation will be either\n \"DIRECT\" or \"INDIRECT\"." + { + "type": "null" + } + ], + "description": "Result from L2 Build Agent (optional)" + } + }, + "type": "object", + "title": "PackageCheckerContext", + "description": "Consolidates all checker-specific state on AgentMorpheusInfo." + }, + "PackageCheckerStatus": { + "type": "integer", + "enum": [0, 1, 2, 3, 4, 5], + "title": "PackageCheckerStatus", + "description": "Per-CVE status codes produced by the PackageIdentify phase." 
+ }, + "PackageIdentifyResult": { + "properties": { + "affected_rpm_list": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Affected Rpm List", + "default": [] + }, + "fixed_rpm_list": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Fixed Rpm List", + "default": [] + }, + "is_target_package_affected": { + "$ref": "#/components/schemas/EnumIdentifyResult", + "default": "unknown" + }, + "is_target_package_fixed": { + "$ref": "#/components/schemas/EnumIdentifyResult", + "default": "unknown" + }, + "conclusion_reason": { + "type": "string", + "title": "Conclusion Reason", + "description": "Detailed explanation of why the package was determined to be vulnerable or not vulnerable", + "default": "" + } + }, + "type": "object", + "title": "PackageIdentifyResult", + "description": "Result of the PackageIdentify phase for a single CVE." + }, + "PackageState": { + "properties": { + "product_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Product Name" + }, + "fix_state": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Fix State" + }, + "package_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Package Name" + }, + "cpe": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cpe" + } + }, + "type": "object", + "title": "PackageState" + }, + "PipelineMode": { + "type": "string", + "enum": [ + "full_pipeline", + "rpm_package_checker" + ], + "title": "PipelineMode", + "description": "Controls which investigation path the pipeline takes after process_sbom.\nOrthogonal to AnalysisType (input format) -- any combination is valid." 
+ }, + "ResponseIntermediateStep": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "parent_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Parent Id" + }, + "type": { + "type": "string", + "title": "Type", + "default": "markdown" + }, + "name": { + "type": "string", + "title": "Name" + }, + "payload": { + "type": "string", + "title": "Payload" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "id", + "name", + "payload" + ], + "title": "ResponseIntermediateStep", + "description": "ResponseSerializedStep is a data model that represents a serialized step in the NAT chat streaming API." + }, + "SBOMInfo": { + "properties": { + "packages": { + "items": { + "$ref": "#/components/schemas/SBOMPackage" + }, + "type": "array", + "title": "Packages" + } + }, + "type": "object", + "required": [ + "packages" + ], + "title": "SBOMInfo", + "description": "List of SBOMPackage objects representing the packages found in the input image." + }, + "SBOMPackage": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "version": { + "type": "string", + "title": "Version" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Path" + }, + "system": { + "type": "string", + "title": "System" + } + }, + "type": "object", + "required": [ + "name", + "version", + "system" + ], + "title": "SBOMPackage", + "description": "Information about a single package in the container image's Software Bill of Materials (SBOM)." + }, + "ScanInfoInput": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Type" + }, + "started_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Started At", + "description": "Scan start time as ISO-8601 with UTC offset (e.g. ...+00:00)." 
+ }, + "completed_at": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Completed At", + "description": "Scan completion time as ISO-8601 with UTC offset (e.g. ...+00:00)." + }, + "vulns": { + "items": { + "$ref": "#/components/schemas/VulnInfo" + }, + "type": "array", + "minItems": 1, + "title": "Vulns" + } + }, + "type": "object", + "required": [ + "vulns" + ], + "title": "ScanInfoInput", + "description": "Information about a unique scan for a container image against a list of vulnerabilies." + }, + "SourceDocumentsInfo": { + "properties": { + "type": { + "type": "string", + "enum": [ + "code", + "doc" + ], + "title": "Type" + }, + "git_repo": { + "type": "string", + "minLength": 1, + "title": "Git Repo" + }, + "ref": { + "type": "string", + "minLength": 1, + "title": "Ref" + }, + "include": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Include", + "default": [ + "*.py", + "*.ipynb" + ] + }, + "exclude": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Exclude", + "default": [] + } + }, + "type": "object", + "required": [ + "type", + "git_repo", + "ref" + ], + "title": "SourceDocumentsInfo", + "description": "Information about the source documents for the container image.\n\n- type: document type.\n- git_repo: git repo URL where the source documents can be cloned.\n- ref: git reference, such as tag/branch/commit_id\n- include: file extensions to include when indexing the source documents.\n- exclude: file extensions to exclude when indexing the source documents." 
+ }, + "TargetPackage": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Version" + }, + "release": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Release" + }, + "arch": { + "type": "string", + "title": "Arch", + "default": "x86_64" + } + }, + "type": "object", + "required": [ + "name" + ], + "title": "TargetPackage", + "description": "A package to investigate." + }, + "TextContent": { + "properties": { + "type": { + "type": "string", + "const": "text", + "title": "Type", + "default": "text" + }, + "text": { + "type": "string", + "title": "Text", + "default": "default" + } + }, + "additionalProperties": false, + "type": "object", + "title": "TextContent" + }, + "Usage": { + "properties": { + "prompt_tokens": { + "type": "integer", + "title": "Prompt Tokens" + }, + "completion_tokens": { + "type": "integer", + "title": "Completion Tokens" + }, + "total_tokens": { + "type": "integer", + "title": "Total Tokens" + } + }, + "type": "object", + "required": [ + "prompt_tokens", + "completion_tokens", + "total_tokens" + ], + "title": "Usage" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + }, + "VdbPaths": { + "properties": { + "code_vdb_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Code Vdb Path" + }, + "doc_vdb_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Doc Vdb Path" + }, + "code_index_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } 
+ ], + "title": "Code Index Path" + } + }, + "type": "object", + "title": "VdbPaths", + "description": "Paths to where the generated VDBs are stored." + }, + "VulnInfo": { + "properties": { + "vuln_id": { + "type": "string", + "title": "Vuln Id" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Score" + }, + "severity": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Severity" + }, + "published_date": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Published Date" + }, + "last_modified_date": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Last Modified Date" + }, + "url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Url" + }, + "feed_group": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Feed Group" + }, + "package": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Package" + }, + "package_version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Package Version" + }, + "package_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Package Name" + }, + "package_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Package Type" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "vuln_id" + ], + "title": "VulnInfo", + "description": "Information about a vulnerability." 
+ }, + "VulnerabilityIntel": { + "properties": { + "affected_files": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Affected Files", + "description": "Source file paths likely to contain vulnerable code" + }, + "vulnerable_functions": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Vulnerable Functions", + "description": "Function names that contain or handle the vulnerability" + }, + "vulnerable_variables": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Vulnerable Variables", + "description": "Variable names involved in the vulnerability" + }, + "vulnerable_patterns": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Vulnerable Patterns", + "description": "Code patterns/snippets indicating vulnerable code (from - lines)" + }, + "fix_patterns": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Fix Patterns", + "description": "Code patterns/snippets indicating fixed code (from + lines)" + }, + "root_cause": { + "type": "string", + "title": "Root Cause", + "description": "Technical explanation of why the code is vulnerable", + "default": "" + }, + "vulnerability_type": { + "type": "string", + "title": "Vulnerability Type", + "description": "Category: buffer_overflow, integer_overflow, use_after_free, null_deref, etc.", + "default": "" + }, + "search_keywords": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Search Keywords", + "description": "Recommended grep patterns ordered by specificity (most specific first)" + }, + "affected_architectures": { + "type": "string", + "enum": [ + "32-bit", + "64-bit", + "both" + ], + "title": "Affected Architectures", + "description": "Which CPU architectures are affected: 32-bit only, 64-bit only, or both (default)", + "default": "both" + }, + "is_downstream_patch_available": { + "type": "boolean", + "title": "Is Downstream Patch Available", + "description": "True if a CVE-specific patch file exists in 
the downstream package", + "default": false + }, + "is_patch_applied_in_build": { + "type": "boolean", + "title": "Is Patch Applied In Build", + "description": "True if the patch was confirmed applied in build logs", + "default": false + }, + "patch_file_name": { + "type": "string", + "title": "Patch File Name", + "description": "Name of the CVE-specific patch file (if available)", + "default": "" + } + }, + "type": "object", + "title": "VulnerabilityIntel", + "description": "Structured intelligence extracted from CVE advisories and patches.\n\nUsed to provide grep-ready patterns and context for L1 agent source searches." + }, + "VulnerableDependencies": { + "properties": { + "vuln_id": { + "type": "string", + "title": "Vuln Id" + }, + "vuln_package_intel_sources": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Vuln Package Intel Sources" + }, + "vulnerable_sbom_packages": { + "items": { + "$ref": "#/components/schemas/VulnerableSBOMPackage" + }, + "type": "array", + "title": "Vulnerable Sbom Packages" + } + }, + "type": "object", + "required": [ + "vuln_id", + "vuln_package_intel_sources", + "vulnerable_sbom_packages" + ], + "title": "VulnerableDependencies", + "description": "Information about the vulnerable SBOM packages associated with the vuln_id.\n\n- vuln_id: vulnerability ID (e.g. CVE ID, GHSA ID) associated with the vulnerable package list.\n- vuln_package_intel_sources: list of sources (e.g. \"ghsa\", \"nvd\", \"ubuntu\", \"rhsa\") that provided\n the vulnerable package/version intel for the vuln_id.\n- vulnerable_sbom_packages: list of VulnerableSBOMPackage objects, representing the SBOM packages that are\n vulnerable for a given vuln_id." 
+ }, + "VulnerableSBOMPackage": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "version": { + "type": "string", + "title": "Version" + }, + "vulnerable_dependency_package": { + "$ref": "#/components/schemas/DependencyPackage" } + }, + "type": "object", + "required": [ + "name", + "version", + "vulnerable_dependency_package" + ], + "title": "VulnerableSBOMPackage", + "description": "Information about a vulnerable SBOM package and its related vulnerable dependency package.\n\n- name: SBOM package name\n- version: SBOM package version\n- vulnerable_dependency_package: DependencyPackage object with info about the vulnerable dependency package.\n If an SBOM package itself is vulnerable, the vulnerable_dependency_package.relation will be \"SELF\".\n Otherwise, if it is vulnerable due to its dependency, the vulnerable_dependency_package.relation will be either\n \"DIRECT\" or \"INDIRECT\"." } + } } -} \ No newline at end of file + } \ No newline at end of file diff --git a/src/vuln_analysis/data_models/output.py b/src/vuln_analysis/data_models/output.py index 2af85daa9..b182c9fa2 100644 --- a/src/vuln_analysis/data_models/output.py +++ b/src/vuln_analysis/data_models/output.py @@ -90,6 +90,7 @@ class AgentMorpheusEngineOutput(BaseModel): justification: JustificationOutput intel_score: int cvss: CVSSOutput | None + details: str | None = None class OutputPayload(BaseModel): diff --git a/src/vuln_analysis/functions/build_agent_graph_defs.py b/src/vuln_analysis/functions/build_agent_graph_defs.py new file mode 100644 index 000000000..698ff1b84 --- /dev/null +++ b/src/vuln_analysis/functions/build_agent_graph_defs.py @@ -0,0 +1,692 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Graph definitions for the L2 Build Agent (BuildCompilationCheck). + +Houses the LangGraph state schema and structured-output schemas for BuildHarvestReport. +Prompt templates are in vuln_analysis.utils.rpm_checker_prompts. +""" + +from __future__ import annotations + +import logging +import re +from pathlib import Path +from typing import Literal, NotRequired + +from langgraph.graph import MessagesState +from pydantic import BaseModel, Field + +from exploit_iq_commons.utils.hardening_kb import HardeningEntry +from vuln_analysis.functions.react_internals import CheckerThought, Observation +from vuln_analysis.utils.rpm_checker_prompts import ( + L2_CONFIG_SYS_PROMPT, + L2_CONFIG_PROMPT_TEMPLATE, + L2_CONFIG_THOUGHT_INSTRUCTIONS, + L2_HARDENING_SYS_PROMPT, + L2_HARDENING_PROMPT_TEMPLATE, + L2_HARDENING_THOUGHT_INSTRUCTIONS, + L2_COMPILATION_VERDICT_PROMPT, + L2_HARDENING_VERDICT_PROMPT, + L2_COMPREHENSION_PROMPT, + L2_MEMORY_UPDATE_PROMPT, + L2_HARDENING_COMPREHENSION_PROMPT, + L2_HARDENING_MEMORY_UPDATE_PROMPT, +) +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Data Models +# --------------------------------------------------------------------------- + + +class BuildHarvestReport(BaseModel): + """Deterministic data harvested from build artifacts. + + Extracted during the data_harvest_node before the ReAct loop begins. 
+ + Key vulnerability-relevant data: + - Feature disable flags that prevent vulnerable code from being compiled: + - OpenSSL style: no-sm2, no-ssl3, no-md5, no-asm + - Autoconf style: --disable-feature, --without-feature + - CMake style: -DENABLE_FEATURE=OFF + - Feature enable flags that explicitly enable optional features: + - Autoconf style: --enable-feature, --with-feature + - CMake style: -DENABLE_FEATURE=ON + - Architecture flags to understand target platform + - Hardening flags relevant to the CVE's CWE class + + Note: Compiled files are NOT pre-extracted. The LLM searches the build log + for affected files from l1_result.vulnerability_intel.affected_files during the ReAct loop. + """ + + disabled_features: list[str] = Field( + default_factory=list, + description="Feature-disabling flags from build log (e.g., '-DOPENSSL_NO_SM2', '-DNO_GZIP')", + ) + spec_disabled_features: list[str] = Field( + default_factory=list, + description="Feature-disabling flags from spec %build section (e.g., 'no-sm2', '--disable-ssl3', '--without-openssl')", + ) + enabled_features: list[str] = Field( + default_factory=list, + description="Feature-enabling flags from build log (e.g., '-DENABLE_LDAP', '-DLDAP_ENABLED')", + ) + spec_enabled_features: list[str] = Field( + default_factory=list, + description="Feature-enabling flags from spec %build section (e.g., '--enable-ldap', '--with-openssl')", + ) + expected_hardening: list[HardeningEntry] = Field( + default_factory=list, + description="Hardening flags relevant to the CVE's CWE, with descriptions for LLM context", + ) + build_architecture: Literal["32-bit", "64-bit", "unknown"] = Field( + default="unknown", + description="Target architecture from -m64/-m32 flags or build target (x86_64/i686)", + ) + linked_libraries: list[str] = Field( + default_factory=list, + description="Libraries linked at build time (extracted from -l flags in build log)", + ) + built_subpackages: list[str] = Field( + default_factory=list, + 
# NOTE(review): both models below are described as "LLM-extracted"; the class
# docstrings and Field descriptions presumably end up in the schema shown to the
# model, so treat that text as runtime data rather than as comments - confirm
# before rewording any of it.
class L2CompileVerdictExtraction(BaseModel):
    """LLM-extracted verdict from L2 agent final answer."""

    # Declared as a plain `str`: the three allowed values are only listed in the
    # description, nothing on this field rejects an out-of-vocabulary status.
    compilation_status: str = Field(
        description="Whether vulnerable code is compiled into the binary: one of 'compiled', 'not_compiled', or 'unknown'"
    )
    # Plain `float`: the 0.0-1.0 range is documented but not enforced by a
    # constraint on the field.
    confidence: float = Field(description="Confidence in the verdict (0.0 to 1.0)")
    # Required free-text field (no default).
    reasoning: str = Field(description="Brief explanation of the verdict")


class L2HardeningVerdictExtraction(BaseModel):
    """LLM-extracted verdict from L2 Hardening investigation."""

    # Plain `str` as well: the four allowed values are documented in the
    # description only and are not validated here.
    hardening_status: str = Field(
        description="Whether hardening flags mitigate the vulnerability: one of 'mitigated', 'not_mitigated', 'not_applicable', or 'unknown'"
    )
    # The only optional field of the two models: defaults to a fresh empty list.
    hardening_flags: list[str] = Field(
        default_factory=list,
        description="List of specific hardening flags found (e.g., -fstack-protector-strong, -D_FORTIFY_SOURCE=2)",
    )
    # Same shape as L2CompileVerdictExtraction.confidence: documented range,
    # no enforced bounds.
    confidence: float = Field(description="Confidence in the verdict (0.0 to 1.0)")
    # Required free-text field (no default).
    reasoning: str = Field(description="Brief explanation of the verdict")
class BuildAgentState(MessagesState):
    """LangGraph state for the L2 Build Agent."""

    # Every key below is NotRequired, so a state value may omit any of them;
    # most are additionally nullable.
    # NOTE(review): the base class presumably contributes the `messages`
    # channel - confirm against langgraph's MessagesState.

    # Structured harvest of build artifacts (see BuildHarvestReport).
    harvest_report: NotRequired[BuildHarvestReport | None]
    # String form of the vulnerability intel.
    # NOTE(review): presumably pre-rendered for prompt insertion - confirm.
    vulnerability_intel_str: NotRequired[str | None]
    # NOTE(review): looks like the verdict handed over from the L1 stage -
    # verify against the node that writes it.
    l1_preliminary_verdict: NotRequired[str | None]
    runtime_prompt: NotRequired[str | None]
    # Thought / observation objects from react_internals.
    # NOTE(review): presumably the latest of each per loop turn - confirm.
    thought: NotRequired[CheckerThought | None]
    observation: NotRequired[Observation | None]
    # Integer counters; unlike the other keys these are not nullable.
    step: NotRequired[int]
    max_steps: NotRequired[int]
    # These two keys are PascalCase, unlike the snake_case keys above. The key
    # names are part of the state contract, so renaming them is a behavior
    # change, not a cosmetic one.
    L2CompileVerdict: NotRequired[L2CompileVerdictExtraction | None]
    L2HardeningVerdict: NotRequired[L2HardeningVerdictExtraction | None]
    evidence_sources: NotRequired[list[str] | None]
# RPM section markers that may follow (and therefore terminate) %build.
# Macros such as %configure, %ifarch or %{name} are deliberately absent: the
# trailing \b in the pattern keeps "%conf" from matching "%configure" and
# "%pre" from matching "%prep"/"%pretrans".
_SPEC_SECTION_MARKERS = (
    "prep",
    "conf",
    "generate_buildrequires",
    "install",
    "check",
    "clean",
    "files",
    "pretrans",
    "pre",
    "preuntrans",
    "preun",
    "posttrans",
    "post",
    "postuntrans",
    "postun",
    "verifyscript",
    "changelog",
    "package",
    "description",
)

_SPEC_BUILD_SECTION_RE = re.compile(
    r"^%build\s*\n(.*?)(?=^%(?:" + "|".join(_SPEC_SECTION_MARKERS) + r")\b|\Z)",
    re.MULTILINE | re.DOTALL,
)

# --enable-x / --disable-x / --with-x / --without-x. Feature names may contain
# hyphens (--disable-silent-rules). The optional third group only ever consumes
# a literal "=no" value, so an argument such as --with-foo=/usr is left alone.
_AUTOCONF_SWITCH_RE = re.compile(
    r"--(enable|disable|with|without)-(\w+(?:-\w+)*)(=[\"']?no[\"']?(?!\S))?"
)

# OpenSSL Configure style: no-sm2, no-ssl3-method, no-asm.
_OPENSSL_NO_RE = re.compile(r"\bno-(\w+(?:-\w+)*)")

# CMake style, with an optional type suffix (-DENABLE_FOO:BOOL=OFF). The
# trailing \b stops "=0" from matching "=0x10" and "=OFF" from matching "=OFFSET".
_CMAKE_OFF_RE = re.compile(r"-DENABLE_(\w+)(?::\w+)?=(?:OFF|0|FALSE)\b", re.IGNORECASE)
_CMAKE_ON_RE = re.compile(r"-DENABLE_(\w+)(?::\w+)?=(?:ON|1|TRUE)\b", re.IGNORECASE)

# Linker -l flags preceded by start-of-line or whitespace. Names must start
# with a letter but may contain '+', '.', '-' (stdc++, python3.11, gmock-main).
_LINK_FLAG_RE = re.compile(r"(?:^|\s)-l([a-zA-Z][\w.+-]*)", re.MULTILINE)

_PACKAGE_RE = re.compile(r"^%package\s+(?:-n\s+)?(\S+)", re.MULTILINE)
_BCOND_WITH_RE = re.compile(r"%bcond_with\s+(\w+)")
_BCOND_DEFAULT_OFF_RE = re.compile(r"^%bcond[ \t]+(\w+)[ \t]+0[ \t]*$", re.MULTILINE)


def _extract_spec_build_section(spec_path: Path) -> str:
    """Extract the %build section from an RPM spec file.

    The %build section contains configure/Configure commands with feature flags
    that determine what code is compiled.

    Args:
        spec_path: Path to the RPM spec file

    Returns:
        The stripped %build section content, or an empty string if the file
        cannot be read or has no %build section
    """
    try:
        content = spec_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        logger.warning("Failed to read spec file %s: %s", spec_path, e)
        return ""

    # The section runs until the next real section marker or end of file.
    match = _SPEC_BUILD_SECTION_RE.search(content)
    return match.group(1).strip() if match else ""


def _split_autoconf_switches(build_section: str) -> tuple[set[str], set[str]]:
    """Classify Autoconf switches into (enabled, disabled) feature names.

    ``--enable-x=no`` and ``--with-x=no`` are Autoconf's spelling of
    ``--disable-x`` / ``--without-x`` and are classified as disabled.
    """
    enabled: set[str] = set()
    disabled: set[str] = set()
    for kind, name, negated in _AUTOCONF_SWITCH_RE.findall(build_section):
        turned_off = kind in ("disable", "without") or bool(negated)
        (disabled if turned_off else enabled).add(name)
    return enabled, disabled


def _extract_spec_disabled_features(build_section: str) -> list[str]:
    """Extract feature-disable flags from spec %build section.

    Recognizes patterns from common build systems:
    - OpenSSL style: no-sm2, no-ssl3, no-asm
    - Autoconf style: --disable-feature, --without-feature,
      --enable-feature=no, --with-feature=no
    - CMake style: -DENABLE_FEATURE=OFF (also =0 / =FALSE, optional :TYPE)

    Args:
        build_section: The raw %build section content

    Returns:
        Sorted list of disabled feature names (without prefix); hyphenated
        names are kept whole (e.g. 'silent-rules', not 'silent')
    """
    _, disabled = _split_autoconf_switches(build_section)
    disabled.update(_OPENSSL_NO_RE.findall(build_section))
    disabled.update(_CMAKE_OFF_RE.findall(build_section))
    return sorted(disabled)


def _extract_spec_enabled_features(build_section: str) -> list[str]:
    """Extract feature-enable flags from spec %build section.

    Recognizes patterns from common build systems:
    - Autoconf style: --enable-feature, --with-feature (unless the value is
      the literal 'no', which Autoconf treats as a disable)
    - CMake style: -DENABLE_FEATURE=ON (also =1 / =TRUE, optional :TYPE)

    Args:
        build_section: The raw %build section content

    Returns:
        Sorted list of enabled feature names (without prefix); hyphenated
        names are kept whole (e.g. 'system-zlib', not 'system')
    """
    enabled, _ = _split_autoconf_switches(build_section)
    enabled.update(_CMAKE_ON_RE.findall(build_section))
    return sorted(enabled)


def _extract_linked_libraries(build_log_content: str) -> list[str]:
    """Extract libraries linked at build time from build log.

    Collects the argument of every ``-l<name>`` flag that is preceded by
    whitespace or start of line.

    Args:
        build_log_content: Full content of the build log file

    Returns:
        Sorted list of unique library names as passed to -l (no 'lib' prefix);
        names such as 'stdc++' or 'python3.11' are returned unabbreviated
    """
    return sorted(set(_LINK_FLAG_RE.findall(build_log_content)))


def _extract_spec_subpackages(spec_content: str) -> tuple[list[str], list[str]]:
    """Extract built and excluded subpackages from spec file.

    Built names come from %package directives. Excluded names come from build
    conditionals that are OFF by default:
    - ``%bcond_with NAME``  (adds a --with option, so the default is off)
    - ``%bcond NAME 0``     (newer single-macro form with a literal 0 default)

    ``%bcond_without NAME`` is intentionally NOT treated as excluded: it adds a
    --without option, which means the feature is ON unless explicitly disabled.

    ExcludeArch is not evaluated because the target architecture is not known
    at this point.

    Args:
        spec_content: Full content of the spec file

    Returns:
        Tuple of (built_subpackages, excluded_subpackages), each sorted
    """
    # %package -n libfoo / %package devel; names that start with a macro
    # (e.g. %{name}-libs) cannot be resolved here and are skipped.
    built = {
        name
        for name in _PACKAGE_RE.findall(spec_content)
        if not name.startswith("%")
    }

    excluded = set(_BCOND_WITH_RE.findall(spec_content))
    excluded.update(_BCOND_DEFAULT_OFF_RE.findall(spec_content))

    return sorted(built), sorted(excluded)
def _extract_kernel_config(source_path: Path) -> dict[str, str]:
    """Extract kernel CONFIG_* options from a .config-style file.

    Candidate files are tried in this order and the first one that yields at
    least one option wins:
    1. ``<source_path>/.config``
    2. ``<source_path>/configs/.config``
    3. ``<source_path>/configs/kernel-*.config`` (sorted by path, so the
       choice does not depend on filesystem iteration order)

    Args:
        source_path: Path to the kernel source directory

    Returns:
        Dict mapping CONFIG_* names to 'y', 'n' or 'm'. Lines of the form
        ``# CONFIG_X is not set`` are reported as 'n'. Empty if no candidate
        could be parsed.
    """
    candidates = [
        source_path / ".config",
        source_path / "configs" / ".config",
        *sorted(source_path.glob("configs/kernel-*.config")),
    ]

    for config_path in candidates:
        if not config_path.is_file():
            continue

        # Only the read can raise; keep parsing outside the try.
        try:
            content = config_path.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            logger.warning("_extract_kernel_config: failed to read %s: %s", config_path, e)
            continue

        config: dict[str, str] = dict(
            re.findall(r"^(CONFIG_\w+)=(y|n|m)", content, re.MULTILINE)
        )
        # "# CONFIG_X is not set" is the kernel's spelling of CONFIG_X=n.
        for name in re.findall(r"^# (CONFIG_\w+) is not set", content, re.MULTILINE):
            config[name] = "n"

        if config:
            logger.info(
                "_extract_kernel_config: parsed %d options from %s",
                len(config), config_path
            )
            return config

    return {}


def _locate_kconfig_dir(source_path: Path | None) -> Path | None:
    """Return the directory holding the top-level Kconfig, or None.

    Search order: ``source_path`` itself, then for each ``linux-*`` child (in
    sorted order) its nested ``linux-*`` directories first and the child
    itself second. Never logs, so it is safe to use as a pure predicate.
    """
    if not source_path or not source_path.exists():
        return None

    if (source_path / "Kconfig").exists():
        return source_path

    for subdir in sorted(source_path.glob("linux-*")):
        if not subdir.is_dir():
            continue
        # Nested layout first, e.g. linux-5.14.0/linux-5.14.0.el9/Kconfig
        for nested in sorted(subdir.glob("linux-*")):
            if nested.is_dir() and (nested / "Kconfig").exists():
                return nested
        if (subdir / "Kconfig").exists():
            return subdir

    return None


def _is_kernel_package(source_path: Path) -> bool:
    """Detect kernel package by presence of Kconfig in source root or linux-* subdir.

    Args:
        source_path: Path to the source directory

    Returns:
        True if a Kconfig file is found in ``source_path``, in a ``linux-*``
        child, or in a ``linux-*`` directory nested inside such a child;
        False otherwise (including a missing or falsy path)
    """
    return _locate_kconfig_dir(source_path) is not None


def _find_kernel_config_file(source_path: Path, arch: str) -> Path | None:
    """Find the kernel config file for a specific architecture.

    Prefers the base flavor ``kernel-{arch}-rhel.config`` in the source root
    and otherwise falls back to the first ``kernel-{arch}*.config`` file in
    sorted order (deterministic across filesystems).

    Args:
        source_path: Path to the source directory
        arch: Target architecture (e.g., 'x86_64', 'aarch64')

    Returns:
        Path to the config file, or None if the path is missing, ``arch`` is
        empty, or no matching file exists
    """
    if not source_path or not source_path.exists() or not arch:
        return None

    base_config = source_path / f"kernel-{arch}-rhel.config"
    if base_config.exists():
        logger.info("_find_kernel_config_file: found config at %s", base_config)
        return base_config

    for candidate in sorted(source_path.glob(f"kernel-{arch}*.config")):
        if candidate.is_file():
            logger.info("_find_kernel_config_file: found fallback config at %s", candidate)
            return candidate

    logger.warning("_find_kernel_config_file: no config found for arch %s in %s", arch, source_path)
    return None


def _find_kernel_source_root(source_path: Path) -> Path | None:
    """Find the actual kernel source tree root containing Kconfig.

    Uses the same search as ``_is_kernel_package`` so the two can never
    disagree about what counts as a kernel tree. Example nested layout:

        source/
        ├── kernel-x86_64-rhel.config
        └── linux-5.14.0/
            └── linux-5.14.0.el9/
                └── Kconfig

    Args:
        source_path: Path to the source directory

    Returns:
        Path to the kernel source root (directory containing Kconfig), or None
    """
    # A missing/falsy path is not worth a warning; just report "not found".
    if not source_path or not source_path.exists():
        return None

    root = _locate_kconfig_dir(source_path)
    if root is None:
        logger.warning("_find_kernel_source_root: no kernel source root found in %s", source_path)
        return None

    # Only log when the root had to be discovered below source_path.
    if root != source_path:
        logger.info("_find_kernel_source_root: found at %s", root)
    return root
+ + Parses: + - Feature-disabling -D defines (e.g., -DOPENSSL_NO_SM2, -DNO_GZIP) + - Feature-enabling -D defines (e.g., -DENABLE_LDAP, -DLDAP_ENABLED) + - Linked libraries from -l flags + - Subpackages from spec %package directives + - Kernel CONFIG_* options (for kernel packages only) + + For kernel packages, uses CONFIG_* based compilation checking instead of + build log analysis (kernel builds use make -s which hides individual compilations). + + Args: + build_log_path: Path to the build log file + spec_path: Path to the RPM spec file + cwe_id: CWE identifier to look up expected hardening flags (e.g., 'CWE-121') + source_path: Path to the source directory (for kernel config extraction) + package_name: Name of the package (to detect kernel packages) + arch: Target architecture (e.g., 'x86_64') for kernel config file selection + + Returns: + BuildHarvestReport with harvested data and expected hardening flags + """ + from exploit_iq_commons.utils.hardening_kb import HardeningKB + from vuln_analysis.tools.source_inspector import SourceInspector + + # Detect kernel package and find kernel-specific paths + is_kernel = _is_kernel_package(source_path) if source_path else False + kernel_config_path: str | None = None + kernel_source_root: str | None = None + + if is_kernel: + logger.info("harvest_build_data: detected kernel package") + # Find kernel config file for the target architecture + if arch: + config_file = _find_kernel_config_file(source_path, arch) + if config_file: + kernel_config_path = str(config_file) + # Find kernel source root (contains Kconfig, Makefiles) + source_root = _find_kernel_source_root(source_path) + if source_root: + kernel_source_root = str(source_root) + + # Handle case where build_log_path is a directory instead of a file + if build_log_path and build_log_path.is_dir(): + log_files = list(build_log_path.glob("*-build.log")) or list(build_log_path.glob("*.log")) + if log_files: + build_log_path = log_files[0] + 
logger.info("harvest_build_data: resolved build log directory to file: %s", build_log_path) + else: + logger.warning("harvest_build_data: build_log_path is a directory but no .log files found") + build_log_path = None + + # Lookup expected hardening flags from KB based on CWE + expected_hardening = [] + if cwe_id: + kb = HardeningKB.get_instance() + expected_hardening = kb.lookup_by_cwe(cwe_id) + logger.info( + "harvest_build_data: CWE %s maps to %d hardening flags", + cwe_id, + len(expected_hardening), + ) + + # Extract feature-disabling defines from build log + disabled_features: list[str] = [] + if build_log_path: + inspector = SourceInspector(build_log_path.parent) + + # Grep for lines containing -D defines + matches = inspector.grep_content(r"-D\w+", file_path=build_log_path) + + # Extract unique defines from matched lines + all_defines: set[str] = set() + for match in matches: + defines = re.findall(r"-D(\w+)", match.line_content) + all_defines.update(defines) + + # Filter for feature-disabling patterns: + # - NO_* prefix (e.g., NO_GZIP) + # - DISABLE_* prefix (e.g., DISABLE_SSL) + # - WITHOUT_* prefix (e.g., WITHOUT_FEATURE) + # - *_NO_* infix (e.g., OPENSSL_NO_SM2) + # - *_DISABLE_* infix + # - *_DISABLED suffix + disable_pattern = re.compile( + r"^(NO_|DISABLE_|WITHOUT_)|(_NO_|_DISABLE_)|(_DISABLED$)" + ) + disabled_features = sorted( + d for d in all_defines if disable_pattern.search(d) + ) + + if disabled_features: + logger.info( + "harvest_build_data: found %d disabled features in build log", + len(disabled_features), + ) + + # Filter for feature-enabling patterns: + # - ENABLE_* prefix (e.g., ENABLE_LDAP) + # - *_ENABLED suffix (e.g., LDAP_ENABLED) + enable_pattern = re.compile(r"^ENABLE_|_ENABLED$") + enabled_features = sorted( + d for d in all_defines if enable_pattern.search(d) + ) + + if enabled_features: + logger.info( + "harvest_build_data: found %d enabled features in build log", + len(enabled_features), + ) + else: + enabled_features = [] + + # 
Extract linked libraries from build log (skip for kernel - not meaningful) + # Note: Architecture detection removed - use target_package.arch instead (checked in L1) + build_architecture: Literal["32-bit", "64-bit", "unknown"] = "unknown" + linked_libraries: list[str] = [] + if build_log_path and not is_kernel: + try: + build_log_content = build_log_path.read_text(encoding="utf-8", errors="replace") + linked_libraries = _extract_linked_libraries(build_log_content) + if linked_libraries: + logger.info( + "harvest_build_data: found %d linked libraries", + len(linked_libraries), + ) + except OSError as e: + logger.warning("harvest_build_data: failed to read build log: %s", e) + + # Extract %build section and features from spec file + spec_build_section = "" + spec_disabled_features: list[str] = [] + spec_enabled_features: list[str] = [] + built_subpackages: list[str] = [] + excluded_subpackages: list[str] = [] + if spec_path and spec_path.exists(): + spec_build_section = _extract_spec_build_section(spec_path) + spec_disabled_features = _extract_spec_disabled_features(spec_build_section) + spec_enabled_features = _extract_spec_enabled_features(spec_build_section) + + if spec_disabled_features: + logger.info( + "harvest_build_data: found %d disabled features in spec", + len(spec_disabled_features), + ) + if spec_enabled_features: + logger.info( + "harvest_build_data: found %d enabled features in spec", + len(spec_enabled_features), + ) + + # Extract subpackages from full spec content + try: + spec_content = spec_path.read_text(encoding="utf-8", errors="replace") + built_subpackages, excluded_subpackages = _extract_spec_subpackages(spec_content) + if built_subpackages: + logger.info( + "harvest_build_data: found %d built subpackages", + len(built_subpackages), + ) + if excluded_subpackages: + logger.info( + "harvest_build_data: found %d excluded subpackages", + len(excluded_subpackages), + ) + except OSError as e: + logger.warning("harvest_build_data: failed to read spec 
for subpackages: %s", e) + + # Extract kernel config if this is a kernel package + # Note: kernel_config dict is kept for backward compatibility but the primary + # mechanism for kernel is now CONFIG lookup via kernel_config_path + kernel_config: dict[str, str] = {} + if is_kernel and source_path and source_path.exists(): + kernel_config = _extract_kernel_config(source_path) + if kernel_config: + logger.info( + "harvest_build_data: extracted %d kernel config options", + len(kernel_config), + ) + + return BuildHarvestReport( + disabled_features=disabled_features, + spec_disabled_features=spec_disabled_features, + enabled_features=enabled_features, + spec_enabled_features=spec_enabled_features, + expected_hardening=expected_hardening, + build_architecture=build_architecture, + linked_libraries=linked_libraries, + built_subpackages=built_subpackages, + excluded_subpackages=excluded_subpackages, + kernel_config=kernel_config, + is_kernel=is_kernel, + kernel_config_path=kernel_config_path, + kernel_source_root=kernel_source_root, + ) + diff --git a/src/vuln_analysis/functions/code_agent_graph_defs.py b/src/vuln_analysis/functions/code_agent_graph_defs.py new file mode 100644 index 000000000..bbec8a79d --- /dev/null +++ b/src/vuln_analysis/functions/code_agent_graph_defs.py @@ -0,0 +1,1688 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Graph definitions for the L1 Package Code Agent. + +Houses the LangGraph state schema, structured-output schemas for +DownstreamSearchReport/UpstreamSearchReport pipelines, and CodeAgentReport. +Prompt templates are in vuln_analysis.utils.rpm_checker_prompts. +""" + +from __future__ import annotations + +import logging +import re +import shutil +import subprocess +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, NotRequired, TYPE_CHECKING + +if TYPE_CHECKING: + from langchain_core.language_models import BaseChatModel + from vuln_analysis.tools.brew_downloader import BrewDownloader + +import aiohttp +from langchain_core.messages import HumanMessage, SystemMessage +from langgraph.graph import MessagesState +from pydantic import BaseModel, Field +from unidiff import PatchSet + +logger = logging.getLogger(__name__) + +from exploit_iq_commons.data_models.checker_status import L2BuildResult, VulnerabilityIntel +from exploit_iq_commons.data_models.common import TargetPackage +from vuln_analysis.functions.react_internals import CheckerThought, Observation, L1VerdictExtraction +from vuln_analysis.utils.rpm_checker_prompts import ( + L1_VERDICT_EXTRACTION_PROMPT, + VULNERABILITY_INTEL_EXTRACTION_PROMPT, + CODE_AGENT_REPORT_PROMPT, + L1_AGENT_SYS_PROMPT_PATCH_AVAILABLE, + L1_AGENT_SYS_PROMPT_UPSTREAM_PATCH, + L1_AGENT_SYS_PROMPT_REBASE_FIX, + L1_AGENT_SYS_PROMPT_REBASE_NO_PATCH, + L1_AGENT_PROMPT_TEMPLATE, + L1_AGENT_PROMPT_TEMPLATE_NO_PATCH, + L1_AGENT_THOUGHT_INSTRUCTIONS, + L1_AGENT_THOUGHT_UPSTREAM_INSTRUCTIONS, + L1_AGENT_THOUGHT_REBASE_INSTRUCTIONS, + L1_AGENT_THOUGHT_CVE_DESC_INSTRUCTIONS, + L1_COMPREHENSION_PROMPT, + L1_MEMORY_UPDATE_PROMPT, + L1_COMPREHENSION_PROMPT_CVE_DESC, + L1_MEMORY_UPDATE_PROMPT_CVE_DESC, +) + +# --------------------------------------------------------------------------- +# 
class PatchHunk(BaseModel):
    """A single hunk from a downstream patch file."""
    # Position and size of the hunk on the "source" side of the patch.
    # NOTE(review): presumably these mirror the `@@ -start,length +start,length @@`
    # hunk header as exposed by the unidiff parser -- confirm against the code
    # that constructs this model.
    source_start: int
    source_length: int
    # Position and size of the hunk on the "target" side of the patch.
    target_start: int
    target_length: int
    # Hunk body split by line kind. Each list defaults to a fresh empty list
    # (default_factory), so instances never share state. Per the field
    # descriptions, the leading "-"/"+" diff marker has already been removed
    # from removed/added lines.
    context_lines: list[str] = Field(default_factory=list, description="Unchanged lines")
    removed_lines: list[str] = Field(default_factory=list, description="Deleted lines (- stripped)")
    added_lines: list[str] = Field(default_factory=list, description="Added lines (+ stripped)")
@dataclass
class SpecPatchMatch:
    """Result of finding a CVE patch in a spec file.

    Plain dataclass (not a pydantic model): all five fields are required and
    no validation is performed on construction.
    """
    # NOTE(review): presumably the N of the matching `PatchN:` directive -- confirm with the producer.
    patch_index: int
    # File name of the matched patch.
    patch_filename: str
    # NOTE(review): presumably the spec line exactly as written (e.g. `Patch12: foo.patch`) -- confirm.
    raw_directive: str
    # Comment text associated with the directive; None when there is none.
    # NOTE(review): the exact extraction rule is not visible here -- confirm.
    comment_block: str | None
    # Which part of the spec produced the match: the patch file name or a comment.
    match_source: Literal["filename", "comment"]
class UpstreamSearchReport(BaseModel):
    """Result of an upstream search."""

    # Every field has a default, so an empty report can be built with no arguments.

    # Gates the "Reference Fixed Package" section in _format_upstream_for_report:
    # the NVR, patch file name and parsed patch are only rendered when this is True.
    is_fixed_srpm_is_needed: bool = Field(default=False, description="True if a fixed SRPM is needed downstream style patch files")
    fixed_srpm_file_name: str = Field(default="", description="The name of the fixed SRPM file")
    # Parsed form of the reference patch; None when no patch was parsed.
    fixed_parsed_patch: ParsedPatch | None = Field(default=None, description="The parsed fixed SRPM patch file")
    # Empty string means "no NVR known"; the formatter then prints a generic
    # "Available (from intel)" line instead of the NVR.
    reference_package_nvr: str = Field(
        default="",
        description="NVR (name-version-release) of the reference fixed package from intel",
    )
    reason_cve_code: str = Field(
        default="",
        description="Does the CVE description match the code which is vulnerable",
    )
    # Tri-state string, not a bool: the default "unknown" is rendered by
    # _format_upstream_for_report as "not found", and "yes" as "found".
    is_code_fixed_by_rebase: Literal["yes", "no", "unknown"] = Field(
        default="unknown",
        description="yes if the code is fixed by rebase",
    )
    # NOTE(review): shares its name with DownstreamSearchReport.spec_file_log_change
    # but carries a different description -- confirm both are populated as intended.
    spec_file_log_change: str = Field(
        default="",
        description="The log change of patch in the spec file",
    )
    spec_fixed_srpm_change: str = Field(
        default="",
        description="The change of the fixed SRPM in the spec file",
    )
    # Free-text rationale; appended to the formatted report only when non-empty.
    reason_code_fixed_by_rebase: str = Field(
        default="",
        description="The reason why the code is fixed by rebase",
    )
    # Result of the OSV/GitHub patch lookup, or None when no lookup result is attached.
    osv_result: OSVPatchResult | None = Field(default=None, description="The result of the OSV patch retrieval")
class CodeSnippet(BaseModel):
    """A code snippet from the investigation."""
    # Required: path of the file the snippet belongs to.
    file_path: str = Field(description="Path to the source file")
    # Optional; CodeAgentReport.to_markdown prints "N/A" when this is None
    # (or any other falsy value, since it uses `line_number or 'N/A'`).
    line_number: int | None = Field(default=None, description="Starting line number")
    # Required: the snippet text itself.
    code: str = Field(description="The code content")
    # Annotated as plain `str`, so the three values named in the description are
    # NOT enforced by validation. to_markdown and _cap_snippets_by_type compare
    # against the exact strings "vulnerable" and "fix".
    snippet_type: str = Field(
        description="Type of snippet: one of 'vulnerable', 'fix', or 'context'")
    # Also an unvalidated `str`; the allowed values exist only in the description.
    source: str = Field(
        description="Where this snippet came from: one of 'downstream_patch', 'upstream_patch', or 'source_search'")
+ )) + evidence_chain: list[str] = Field( + description="Ordered list of evidence items tracing the vulnerability through phases") + affected_files: list[str] = Field( + description="Source files where vulnerable code was identified") + patch_analysis: str | None = Field( + default=None, + description="Analysis of downstream patches if any were found") + code_snippets: list[CodeSnippet] = Field( + default_factory=list, + description="Structured code snippets showing vulnerable and fix code") + caveats: list[str] = Field( + default_factory=list, + description="Investigation gaps or uncertainties that may need manual review") + + def to_markdown( + self, + vuln_id: str = "", + target_package: str = "", + version: str = "", + release: str = "", + downstream_report: DownstreamSearchReport | None = None, + ) -> str: + """Render the report as a formatted markdown string.""" + lines: list[str] = [] + + # Header with title + lines.append("# L1 Code Agent Investigation Report") + lines.append("") + + # Verdict banner based on justification label + verdict_map = { + "protected_by_mitigating_control": ("NOT VULNERABLE", "Protected by downstream patch"), + "protected_by_compiler": ("NOT VULNERABLE", "Protected by compiler hardening"), + "code_not_present": ("NOT VULNERABLE", "Vulnerable code not present"), + "code_not_reachable": ("NOT VULNERABLE", "Vulnerable code not reachable"), + "requires_environment": ("NOT VULNERABLE", "Requires specific environment"), + "vulnerable": ("VULNERABLE", "Package requires patching"), + "uncertain": ("UNCERTAIN", "Requires manual review"), + } + verdict_status, verdict_desc = verdict_map.get( + self.justification_label, + ("UNKNOWN", "Unknown status") + ) + + lines.append(f"> **Verdict: {verdict_status}** - {verdict_desc}") + lines.append("") + + # Package information table + lines.append("## Package Information") + lines.append("") + lines.append("| Field | Value |") + lines.append("|-------|-------|") + if vuln_id: + lines.append(f"| **CVE 
ID** | `{vuln_id}` |") + if target_package: + lines.append(f"| **Package** | `{target_package}` |") + if version: + version_str = f"{version}-{release}" if release else version + lines.append(f"| **Version** | `{version_str}` |") + lines.append(f"| **Justification** | `{self.justification_label}` |") + lines.append("") + + # Executive Summary + lines.append("---") + lines.append("") + lines.append("## Executive Summary") + lines.append("") + lines.append(self.executive_summary) + lines.append("") + + # Evidence Chain + lines.append("---") + lines.append("") + lines.append("## Evidence Chain") + lines.append("") + lines.extend(_format_interleaved_evidence( + self.evidence_chain, + downstream_report, + )) + + # Affected Files + if self.affected_files: + lines.append("---") + lines.append("") + lines.append("## Affected Files") + lines.append("") + # Separate source files from test files + source_files = [f for f in self.affected_files if "/test/" not in f and "test_" not in f] + test_files = [f for f in self.affected_files if "/test/" in f or "test_" in f] + + if source_files: + lines.append("**Source files:**") + for f in source_files: + lines.append(f"- `{f}`") + lines.append("") + + if test_files: + lines.append("**Test files:**") + for f in test_files: + lines.append(f"- `{f}`") + lines.append("") + + # Patch Analysis + if self.patch_analysis: + lines.append("---") + lines.append("") + lines.append("## Patch Analysis") + lines.append("") + lines.append(self.patch_analysis) + lines.append("") + + # Code Snippets - separate vulnerable from fix, prioritize main source files + if self.code_snippets: + lines.append("---") + lines.append("") + lines.append("## Code Comparison") + lines.append("") + + # Filter and organize snippets + vuln_snippets = [s for s in self.code_snippets if s.snippet_type == "vulnerable"] + fix_snippets = [s for s in self.code_snippets if s.snippet_type == "fix"] + + # Prioritize main source files (not test/build files) + def 
is_main_source(path: str) -> bool: + return "/test/" not in path and "test_" not in path and "Makefile" not in path and "CMakeLists" not in path + + main_vuln = [s for s in vuln_snippets if is_main_source(s.file_path)] + main_fix = [s for s in fix_snippets if is_main_source(s.file_path)] + + # Show main vulnerability code + if main_vuln: + lines.append("### Vulnerable Code") + lines.append("") + for snippet in main_vuln[:2]: + file_name = snippet.file_path.split("/")[-1] + lines.append(f"**File:** `{file_name}` (Line {snippet.line_number or 'N/A'})") + lines.append("") + lines.append("```c") + lines.append(snippet.code.strip()) + lines.append("```") + lines.append("") + + # Show fix code + if main_fix: + lines.append("### Fix Code") + lines.append("") + for snippet in main_fix[:2]: + file_name = snippet.file_path.split("/")[-1] + lines.append(f"**File:** `{file_name}` (Line {snippet.line_number or 'N/A'})") + lines.append("") + lines.append("```c") + lines.append(snippet.code.strip()) + lines.append("```") + lines.append("") + + # Show other snippets (test/build files) in collapsible section if any + other_vuln = [s for s in vuln_snippets if not is_main_source(s.file_path)] + other_fix = [s for s in fix_snippets if not is_main_source(s.file_path)] + + if other_vuln or other_fix: + lines.append("
") + lines.append("Additional Changes (Test/Build Files)") + lines.append("") + for snippet in other_vuln + other_fix: + file_name = snippet.file_path.split("/")[-1] + lines.append(f"**{snippet.snippet_type.title()}** - `{file_name}`") + lines.append("") + lines.append("```") + lines.append(snippet.code.strip()) + lines.append("```") + lines.append("") + lines.append("
def format_patch_data_for_intel(
    parsed_patch: ParsedPatch | None,
    *,
    max_lines_per_hunk: int = 10,
) -> str:
    """Render a parsed patch as plain text for intelligence extraction.

    The output starts with a ``Patch: <filename>`` header followed by a blank
    line. Each file then contributes a ``File: <target_path>`` line, the
    removed and added lines of every hunk (each group capped at
    ``max_lines_per_hunk`` entries, with a "+N more lines" marker when
    capped), and a trailing blank line.

    Parameters
    ----------
    parsed_patch:
        Parsed patch file structure (may be None if no patch available).
    max_lines_per_hunk:
        Maximum number of removed lines, and separately of added lines, shown
        per hunk. Negative values are treated as 0. Defaults to 10, the
        previously hard-coded limit.

    Returns
    -------
    str
        Formatted string suitable for the VULNERABILITY_INTEL_EXTRACTION_PROMPT,
        or an empty string when ``parsed_patch`` is None/falsy.
    """
    if not parsed_patch:
        return ""

    limit = max(max_lines_per_hunk, 0)

    out = [f"Patch: {parsed_patch.patch_filename}", ""]
    for patch_file in parsed_patch.files:
        out.append(f"File: {patch_file.target_path}")
        for hunk in patch_file.hunks:
            # Removed lines are reported before added lines; both groups share
            # the same truncation rule, so they are rendered by one loop.
            for heading, marker, changed in (
                (" Removed (vulnerable):", "-", hunk.removed_lines),
                (" Added (fix):", "+", hunk.added_lines),
            ):
                if not changed:
                    continue
                out.append(heading)
                out.extend(f" {marker} {text}" for text in changed[:limit])
                hidden = len(changed) - limit
                if hidden > 0:
                    out.append(f" ... (+{hidden} more lines)")
        # Blank separator after each file section.
        out.append("")

    return "\n".join(out)
+ """ + if not parsed_patch: + return "" + + file_pattern = None + if "," in grep_query: + file_pattern = grep_query.split(",")[-1].strip() + + hunks = [] + for pf in parsed_patch.files: + if file_pattern and file_pattern not in pf.target_path: + continue + hunks.append(f"--- a/{pf.target_path}") + hunks.append(f"+++ b/{pf.target_path}") + for hunk in pf.hunks: + for line in hunk.removed_lines: + hunks.append(f"-\t{line}") + for line in hunk.added_lines: + hunks.append(f"+\t{line}") + + return "\n".join(hunks) if hunks else "" + + +# --------------------------------------------------------------------------- +# Report formatting helpers +# --------------------------------------------------------------------------- + +MAX_SNIPPET_CHARS = 500 +L1_EXTRACTED_FACTS_EXCERPT_CHARS = 2000 + + +def _cap_text_excerpt(text: str, max_chars: int) -> tuple[str, bool]: + """Return (possibly truncated) text and whether truncation occurred.""" + t = text.strip() + if len(t) <= max_chars: + return t, False + return t[: max_chars] + "\n[… truncated …]", True + + +def _format_interleaved_evidence( + evidence_chain: list[str], + downstream_report: DownstreamSearchReport | None, + *, + max_excerpt: int = L1_EXTRACTED_FACTS_EXCERPT_CHARS, +) -> list[str]: + """Build audit-ready markdown for the Evidence Chain section. 
+ + Structure follows the 3-pillar model for TARGET package verification: + - Status Summary table for at-a-glance verification + - Target Patch Metadata (the "What") + - Integration Evidence (the "Plan" - spec file directives) + - Execution Evidence (the "Action" - build logs) + - Source Validation (the "Result" - L1 agent findings) + """ + lines: list[str] = [] + + if downstream_report is None: + for ev in evidence_chain: + lines.append(f"- {ev}") + return lines + + d = downstream_report + + # Categorize evidence items by keywords + patch_evidence: list[str] = [] + build_evidence: list[str] = [] + code_evidence: list[str] = [] + other_evidence: list[str] = [] + + patch_keywords = ("patch", "spec", "patchn", "directive", "target", "reference") + build_keywords = ("build", "applied", "log") + code_keywords = ("code", "function", "vulnerable", "fix", "found", "source", "l1", "agent") + + for ev in evidence_chain: + ev_lower = ev.lower() + if any(kw in ev_lower for kw in patch_keywords): + patch_evidence.append(ev) + elif any(kw in ev_lower for kw in build_keywords): + build_evidence.append(ev) + elif any(kw in ev_lower for kw in code_keywords): + code_evidence.append(ev) + else: + other_evidence.append(ev) + + # Status Summary - at-a-glance verification of TARGET package (using bullets for UI compatibility) + lines.append("### Status Summary (Target Package)") + lines.append("") + patch_check = "PASS" if d.is_patch_file_available else "FAIL" + spec_check = "PASS" if d.is_patch_in_spec_file else "FAIL" + build_check = "PASS" if d.is_patch_applied_in_build else "FAIL" + lines.append(f"- **Target patch file exists:** {patch_check}") + lines.append(f"- **Referenced in target spec:** {spec_check}") + lines.append(f"- **Applied in target build:** {build_check}") + lines.append("") + + # Section 1: Target Patch Metadata + if d.patch_file_name or patch_evidence: + lines.append("### 1. 
Patch Metadata") + lines.append("") + if d.patch_file_name: + lines.append(f"- **Target patch file:** `{d.patch_file_name}`") + for ev in patch_evidence: + lines.append(f"- {ev}") + lines.append("") + + # Section 2: Integration Evidence (Spec File) - the "Plan" + has_integration = d.spec_patch_directives_for_cve or d.spec_changelog_cve_lines.strip() + if has_integration: + lines.append("### 2. Integration Evidence (Spec File)") + lines.append("") + + if d.spec_patch_directives_for_cve: + # Split directives into declaration and application + declarations = [line for line in d.spec_patch_directives_for_cve + if line.strip().startswith("Patch")] + applications = [line for line in d.spec_patch_directives_for_cve + if line.strip().startswith("%patch")] + + if declarations: + lines.append("**Patch declaration:**") + lines.append("") + lines.append("```ini") + lines.append("\n".join(declarations)) + lines.append("```") + lines.append("") + + if applications: + lines.append("**Patch application directive:**") + lines.append("") + lines.append("```ini") + lines.append("\n".join(applications)) + lines.append("```") + lines.append("") + + if d.spec_changelog_cve_lines.strip(): + ex, trunc = _cap_text_excerpt(d.spec_changelog_cve_lines, max_excerpt) + hdr = "**Changelog entry:**" + if trunc: + hdr += " *(truncated)*" + lines.append(hdr) + lines.append("") + lines.append("```ini") + lines.append(ex) + lines.append("```") + lines.append("") + + # Section 3: Execution Evidence (Build Log) - the "Action" + if d.build_log_patch_applied.strip() or build_evidence: + lines.append("### 3. 
Execution Evidence (Build Log)") + lines.append("") + + for ev in build_evidence: + lines.append(f"- {ev}") + if build_evidence: + lines.append("") + + if d.build_log_patch_applied.strip(): + ex, trunc = _cap_text_excerpt(d.build_log_patch_applied, max_excerpt) + if trunc: + lines.append("**Build output:** *(truncated)*") + else: + lines.append("**Build output:**") + lines.append("") + lines.append("```bash") + lines.append(ex) + lines.append("```") + lines.append("") + + # Section 4: Source Validation - the "Result" + if code_evidence: + lines.append("### 4. Source Validation") + lines.append("") + for ev in code_evidence: + lines.append(f"- {ev}") + lines.append("") + + # Section 5: Tarball Reference + if d.spec_version_line or d.spec_source0_line: + lines.append("### 5. Tarball Reference") + lines.append("") + if d.spec_version_line: + lines.append(f"- `{d.spec_version_line}`") + if d.spec_source0_line: + lines.append(f"- `{d.spec_source0_line}`") + lines.append("") + + # Additional evidence (uncategorized) + if other_evidence: + lines.append("### Additional Evidence") + lines.append("") + for ev in other_evidence: + lines.append(f"- {ev}") + lines.append("") + + return lines + + +def _format_extracted_facts_section( + d: DownstreamSearchReport, + *, + max_excerpt: int = L1_EXTRACTED_FACTS_EXCERPT_CHARS, +) -> list[str]: + """Build markdown lines for the deterministic *Extracted facts* block. + + .. deprecated:: + This function is deprecated. Use `_format_interleaved_evidence()` instead, + which merges Evidence Chain and Extracted facts into a single interleaved + section for better readability. + """ + warnings.warn( + "_format_extracted_facts_section is deprecated. " + "Use _format_interleaved_evidence() instead.", + DeprecationWarning, + stacklevel=2, + ) + lines: list[str] = [ + "## Extracted facts", + "", + "*Verbatim excerpts from spec/build grep and parsers. 
Narrative sections below are model-generated.*", + "", + ] + lines.append(f"- **Downstream patch file found:** {d.is_patch_file_available}") + if d.patch_file_name: + lines.append(f"- **Patch file name:** `{d.patch_file_name}`") + lines.append(f"- **Patch referenced in spec (CVE grep):** {d.is_patch_in_spec_file}") + lines.append(f"- **Build log shows CVE / patch application:** {d.is_patch_applied_in_build}") + lines.append("") + + if d.spec_patch_directives_for_cve: + lines.append("**Spec `PatchN:` line(s) whose patch filename contains this CVE:**") + block = "\n".join(d.spec_patch_directives_for_cve) + lines.extend(["", "```", block, "```", ""]) + else: + lines.extend(["**Spec `PatchN:` line(s) whose patch filename contains this CVE:** *None found*", ""]) + + if d.spec_changelog_cve_lines.strip(): + ex, trunc = _cap_text_excerpt(d.spec_changelog_cve_lines, max_excerpt) + sub = f" (truncated to ~{max_excerpt} chars)" if trunc else "" + lines.append(f"**%changelog line(s) mentioning this CVE:**{sub}") + lines.extend(["", "```", ex, "```", ""]) + else: + lines.extend(["**%changelog line(s) mentioning this CVE:** *No matching lines* ", ""]) + + if d.spec_file_log_change.strip(): + ex, trunc = _cap_text_excerpt(d.spec_file_log_change, max_excerpt) + hdr = "**All spec lines matching CVE grep (may include Patch, changelog, comments):**" + if trunc: + hdr += f" *({max_excerpt} char excerpt)*" + lines.append(hdr) + lines.extend(["", "```", ex, "```", ""]) + else: + lines.extend(["**All spec lines matching CVE grep:** *None*", ""]) + + if d.build_log_patch_applied.strip(): + ex, trunc = _cap_text_excerpt(d.build_log_patch_applied, max_excerpt) + hdr = "**Build log line(s) matching CVE grep:**" + if trunc: + hdr += f" *({max_excerpt} char excerpt)*" + lines.append(hdr) + lines.extend(["", "```", ex, "```", ""]) + else: + lines.extend(["**Build log line(s) matching CVE grep:** *None or build log not available* ", ""]) + + # Spec tarball reference (Source0/Version) for 
delivery-model context + if d.spec_version_line or d.spec_source0_line: + lines.append("**Spec tarball reference:**") + if d.spec_version_line: + lines.append(f"- `{d.spec_version_line}`") + if d.spec_source0_line: + lines.append(f"- `{d.spec_source0_line}`") + lines.append("") + + return lines + + +def _format_downstream_for_report(report: DownstreamSearchReport | None) -> str: + """Format target package analysis results for prompt injection. + + This section reports whether the TARGET package (the one being scanned) + contains a CVE-specific patch file. + """ + if report is None: + return "Target package analysis did not produce results." + + lines = [] + lines.append(f"**Target Package Patch Available:** {report.is_patch_file_available}") + + if report.is_patch_file_available: + lines.append(f"**Target Patch File:** `{report.patch_file_name}`") + lines.append(f"**Referenced in Spec:** {report.is_patch_in_spec_file}") + if report.spec_file_log_change: + lines.append(f"**Target Spec Changelog:**\n```\n{report.spec_file_log_change[:500]}\n```") + lines.append(f"**Applied in Build:** {report.is_patch_applied_in_build}") + if report.build_log_patch_applied: + lines.append(f"**Build Log Evidence:**\n```\n{report.build_log_patch_applied[:500]}\n```") + + if report.parsed_patch: + lines.append(f"\n**Parsed Patch ({len(report.parsed_patch.files)} files):**") + for pf in report.parsed_patch.files[:5]: + added = sum(len(h.added_lines) for h in pf.hunks) + removed = sum(len(h.removed_lines) for h in pf.hunks) + lines.append(f"- `{pf.target_path}` (+{added}/-{removed} lines)") + if len(report.parsed_patch.files) > 5: + lines.append(f" (+{len(report.parsed_patch.files) - 5} more files)") + else: + lines.append("No CVE-specific patch file found in target package.") + + return "\n".join(lines) + + +def _format_upstream_for_report(report: UpstreamSearchReport | None) -> str: + """Format reference intel gathering results for prompt injection. 
+ + This section reports TWO distinct pieces of information: + 1. Rebase indicator: Checked TARGET's spec file for CVE mention + 2. Reference package: Downloaded a known-fixed package from intel to extract patch patterns + """ + if report is None: + return "Reference intel gathering did not produce results." + + lines = [] + + # Part 1: Rebase indicator (checked in TARGET's spec file) + rebase_status = report.is_code_fixed_by_rebase + if rebase_status == "unknown": + lines.append("**Target Rebase Indicator:** not found (no CVE mention in target's spec file)") + elif rebase_status == "yes": + lines.append("**Target Rebase Indicator:** found (CVE mentioned in target's spec changelog)") + else: + lines.append(f"**Target Rebase Indicator:** {rebase_status}") + + if report.spec_file_log_change: + lines.append(f"**Target Spec Changelog Match:**\n```\n{report.spec_file_log_change[:500]}\n```") + + # Part 2: Reference package (downloaded from intel for comparison) + if report.is_fixed_srpm_is_needed: + if report.reference_package_nvr: + lines.append(f"**Reference Fixed Package:** `{report.reference_package_nvr}` (from intel)") + else: + lines.append(f"**Reference Fixed Package:** Available (from intel)") + lines.append(f"**Reference Patch File:** `{report.fixed_srpm_file_name}`") + if report.fixed_parsed_patch: + lines.append(f"\n**Reference Patch ({len(report.fixed_parsed_patch.files)} files):**") + for pf in report.fixed_parsed_patch.files[:5]: + added = sum(len(h.added_lines) for h in pf.hunks) + removed = sum(len(h.removed_lines) for h in pf.hunks) + lines.append(f"- `{pf.target_path}` (+{added}/-{removed} lines)") + + + if report.reason_code_fixed_by_rebase: + lines.append(f"\n**Rebase Reasoning:** {report.reason_code_fixed_by_rebase}") + + return "\n".join(lines) + + +def _build_agent_status_line(l2_result: L2BuildResult | None) -> str: + """Explicit build-agent run state for the report LLM (avoids inferring from empty XML).""" + if l2_result is None: + return "Build 
agent status: not_run" + if l2_result.l2_override_verdict is not None: + return f"Build agent status: ran_with_override ({l2_result.l2_override_verdict})" + return "Build agent status: ran_no_override (code agent verdict stands)" + + +def _format_l2_for_report(l2_result: L2BuildResult | None) -> str: + """Format build agent results for prompt injection. + + Emits context whenever L2 ran, including when there is no override verdict. + """ + if l2_result is None: + return "" + + override = l2_result.l2_override_verdict or "none" + lines = [ + "", + f"**Override verdict:** {override}", + f"**Compilation status:** {l2_result.compilation_status}", + ] + + if l2_result.compilation_evidence: + lines.append(f"**Compilation evidence:** {l2_result.compilation_evidence}") + + if l2_result.hardening_flags: + flags_str = ", ".join(l2_result.hardening_flags[:10]) + if len(l2_result.hardening_flags) > 10: + flags_str += f" (+{len(l2_result.hardening_flags) - 10} more)" + lines.append(f"**Hardening flags:** {flags_str}") + + if l2_result.hardening_rationale: + lines.append(f"**Hardening rationale:** {l2_result.hardening_rationale}") + + if l2_result.hardening_relevant is not None: + lines.append(f"**Hardening relevant to CVE:** {l2_result.hardening_relevant}") + + if l2_result.evidence_sources: + lines.append(f"**Evidence sources:** {', '.join(l2_result.evidence_sources)}") + + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Report generation pipeline +# --------------------------------------------------------------------------- + +MAX_REPORT_CODE_SNIPPETS_VULNERABLE = 3 +MAX_REPORT_CODE_SNIPPETS_FIX = 3 + + +def _normalize_snippet_path(path: str) -> str: + """Stable comparison key for patch vs affected file paths.""" + p = path.strip().replace("\\", "/") + while p.startswith("./"): + p = p[2:] + if p.startswith("ab/"): + p = p[3:] + return p.lower() + + +def 
_snippet_matches_any_affected_path(snippet_path: str, affected_files: list[str]) -> bool: + if not affected_files: + return False + norm_snip = _normalize_snippet_path(snippet_path) + snip_base = Path(snippet_path).name.lower() + for af in affected_files: + norm_af = _normalize_snippet_path(af) + if norm_snip == norm_af: + return True + if snip_base and snip_base == Path(af).name.lower(): + return True + if norm_snip.endswith(norm_af) or norm_af.endswith(norm_snip): + return True + return False + + +def _rank_patch_snippets_for_relevance( + snippets: list[CodeSnippet], + affected_files: list[str], +) -> list[CodeSnippet]: + """Paths matching affected_files first; preserve original order within each bucket.""" + if not affected_files: + return list(snippets) + indexed = list(enumerate(snippets)) + indexed.sort( + key=lambda pair: ( + 0 if _snippet_matches_any_affected_path(pair[1].file_path, affected_files) else 1, + pair[0], + ), + ) + return [s for _, s in indexed] + + +def _cap_snippets_by_type( + snippets: list[CodeSnippet], + *, + max_vulnerable: int = MAX_REPORT_CODE_SNIPPETS_VULNERABLE, + max_fix: int = MAX_REPORT_CODE_SNIPPETS_FIX, +) -> list[CodeSnippet]: + """Keep insertion order; at most max_vulnerable vulnerable and max_fix fix snippets.""" + n_vuln = n_fix = 0 + out: list[CodeSnippet] = [] + for s in snippets: + if s.snippet_type == "vulnerable": + if n_vuln >= max_vulnerable: + continue + n_vuln += 1 + out.append(s) + elif s.snippet_type == "fix": + if n_fix >= max_fix: + continue + n_fix += 1 + out.append(s) + else: + out.append(s) + return out + + +def _extract_downstream_patch_code_snippets( + downstream_report: DownstreamSearchReport | None, +) -> list[CodeSnippet]: + """Extract vulnerable/fix snippets from the downstream parsed patch only. + + For purely additive patches (no removed lines), shows context lines + as "vulnerable" since they represent the code lacking the fix. 
+ """ + if not downstream_report or not downstream_report.parsed_patch: + return [] + snippets: list[CodeSnippet] = [] + for pf in downstream_report.parsed_patch.files: + for hunk in pf.hunks: + if hunk.removed_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.source_start, + code="\n".join(hunk.removed_lines[:10]), + snippet_type="vulnerable", + source="downstream_patch", + )) + elif hunk.context_lines and hunk.added_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.source_start, + code="\n".join(hunk.context_lines[:10]), + snippet_type="vulnerable", + source="downstream_patch", + )) + if hunk.added_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.target_start, + code="\n".join(hunk.added_lines[:10]), + snippet_type="fix", + source="downstream_patch", + )) + return snippets + + +def _extract_code_snippets( + downstream_report: DownstreamSearchReport | None, + upstream_report: UpstreamSearchReport | None, +) -> list[CodeSnippet]: + """Extract code snippets from parsed patches. + + For purely additive patches (no removed lines), shows context lines + as "vulnerable" since they represent the code lacking the fix. 
+ """ + snippets: list[CodeSnippet] = _extract_downstream_patch_code_snippets(downstream_report) + + if upstream_report and upstream_report.fixed_parsed_patch: + for pf in upstream_report.fixed_parsed_patch.files: + for hunk in pf.hunks: + if hunk.removed_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.source_start, + code="\n".join(hunk.removed_lines[:10]), + snippet_type="vulnerable", + source="upstream_patch", + )) + elif hunk.context_lines and hunk.added_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.source_start, + code="\n".join(hunk.context_lines[:10]), + snippet_type="vulnerable", + source="upstream_patch", + )) + if hunk.added_lines: + snippets.append(CodeSnippet( + file_path=pf.target_path.lstrip("ab/"), + line_number=hunk.target_start, + code="\n".join(hunk.added_lines[:10]), + snippet_type="fix", + source="upstream_patch", + )) + + return snippets + + +async def extract_l1_verdict( + llm, + vuln_id: str, + target_package: str, + final_answer: str, + tracer, +) -> L1VerdictExtraction: + """Use LLM to extract structured verdict from L1 agent's final answer. + + Parameters + ---------- + llm: + LangChain LLM for verdict extraction. + vuln_id: + CVE identifier (e.g. "CVE-2026-5121"). + target_package: + Name of the package being investigated. + final_answer: + The L1 agent's final answer text. + tracer: + Request-scoped tracing context. + + Returns + ------- + L1VerdictExtraction + Structured verdict with confidence and reasoning. 
+ """ + verdict_llm = llm.with_structured_output(L1VerdictExtraction) + prompt = L1_VERDICT_EXTRACTION_PROMPT.format( + vuln_id=vuln_id, + target_package=target_package, + final_answer=final_answer, + ) + with tracer.push_active_function("extract_l1_verdict", input_data={"vuln_id": vuln_id}) as span: + result = await verdict_llm.ainvoke([SystemMessage(content=prompt)]) + span.set_output({ + "preliminary_verdict": result.preliminary_verdict, + "confidence": result.confidence, + }) + logger.info( + "extract_l1_verdict: verdict=%s confidence=%.2f", + result.preliminary_verdict, result.confidence, + ) + return result + + +async def generate_code_agent_report( + *, + llm, + vuln_id: str, + target_package: str, + descriptions: list[tuple[str, str]], + downstream_report: DownstreamSearchReport | None, + upstream_report: UpstreamSearchReport | None, + l1_agent_answer: str | None, + tracer, + policy_context: str = "", + l2_result: L2BuildResult | None = None, +) -> CodeAgentReport: + """Generate the final L1 Code Agent investigation report. + + Synthesizes results from downstream search, upstream search, L1 agent analysis, + and optionally L2 build analysis into a comprehensive, auditable report with + a clear verdict. + + Parameters + ---------- + llm: + LangChain LLM for report generation. + vuln_id: + CVE identifier (e.g. "CVE-2026-5121"). + target_package: + Name of the package being investigated. + descriptions: + ``(source_name, text)`` pairs from CVE intel. + downstream_report: + Output of downstream search (may be None). + upstream_report: + Output of upstream search (may be None). + l1_agent_answer: + Final answer from the L1 ReAct agent (may be None). + tracer: + Request-scoped tracing context. + policy_context: + Pre-formatted NVR posture and RHSA excerpt context for the LLM prompt. + l2_result: + Output of L2 build analysis (may be None). When present, L2 verdicts + override L1 findings as L2 analyzes actual compiled binaries. 
+ + Returns + ------- + CodeAgentReport + Structured report with verdict, evidence, and recommendations. + """ + from langchain_core.messages import HumanMessage, SystemMessage + + cve_description = "\n".join(f"[{src}] {txt}" for src, txt in descriptions) + + downstream_section = _format_downstream_for_report(downstream_report) + upstream_section = _format_upstream_for_report(upstream_report) + l1_agent_section = l1_agent_answer or "Code agent did not produce a final answer." + l2_context_section = _format_l2_for_report(l2_result) + build_agent_status_section = ( + f"\n{_build_agent_status_line(l2_result)}\n\n" + ) + + # Generate override notice when L2 has an override verdict + if l2_result is not None and l2_result.l2_override_verdict is not None: + override_notice_section = ( + "\n" + f"BUILD AGENT OVERRIDE IN EFFECT: The build agent's verdict ({l2_result.l2_override_verdict}) " + "SUPERSEDES the code agent's source analysis.\n" + "Do NOT use 'vulnerable' as the justification_label. " + "Follow the LABEL SELECTION DECISION TREE in the instructions.\n" + "\n" + ) + else: + override_notice_section = "" + + if policy_context: + policy_context_section = ( + "\n" + + policy_context + + "\n\n" + ) + else: + policy_context_section = "" + + prompt_text = CODE_AGENT_REPORT_PROMPT.format( + vuln_id=vuln_id, + target_package=target_package, + cve_description=cve_description, + policy_context_section=policy_context_section, + downstream_section=downstream_section, + upstream_section=upstream_section, + l1_agent_section=l1_agent_section, + l2_context_section=l2_context_section, + build_agent_status_section=build_agent_status_section, + override_notice_section=override_notice_section, + ) + + report_llm = llm.with_structured_output(CodeAgentReport) + + has_l2_override = l2_result is not None and l2_result.l2_override_verdict is not None + with tracer.push_active_function( + "generate_report", + input_data={ + "vuln_id": vuln_id, + "target_package": target_package, + 
"has_downstream_patch": downstream_report.is_patch_file_available if downstream_report else False, + "has_upstream_patch": upstream_report.is_fixed_srpm_is_needed if upstream_report else False, + "has_l1_answer": l1_agent_answer is not None, + "has_l2_override": has_l2_override, + }, + ) as span: + messages = [ + SystemMessage(content=prompt_text), + HumanMessage(content="Generate the report."), + ] + report: CodeAgentReport = await report_llm.ainvoke(messages) + + snippet_source = "unchanged" + downstream_patch_snippet_count_pre_cap = 0 + if downstream_report and downstream_report.parsed_patch: + raw = _extract_downstream_patch_code_snippets(downstream_report) + downstream_patch_snippet_count_pre_cap = len(raw) + ranked = _rank_patch_snippets_for_relevance(raw, report.affected_files) + report.code_snippets = _cap_snippets_by_type(ranked) + snippet_source = "downstream_patch" + elif upstream_report and upstream_report.fixed_parsed_patch: + raw = _extract_code_snippets(downstream_report, upstream_report) + ranked = _rank_patch_snippets_for_relevance(raw, report.affected_files) + report.code_snippets = _cap_snippets_by_type(ranked) + snippet_source = "upstream_patch" + elif not report.code_snippets: + report.code_snippets = _extract_code_snippets(downstream_report, upstream_report) + + span.set_output({ + "justification_label": report.justification_label, + "affected_files_count": len(report.affected_files), + "caveats_count": len(report.caveats), + "code_snippets_count": len(report.code_snippets), + "snippet_source": snippet_source, + "downstream_patch_snippet_count_pre_cap": downstream_patch_snippet_count_pre_cap, + }) + + logger.info( + "generate_code_agent_report: justification=%s", + report.justification_label, + ) + + return report + + +# --------------------------------------------------------------------------- +# Diff and patch helpers +# --------------------------------------------------------------------------- + + +def 
download_patch_and_gen_diff(fix_info: dict, brew_downloader: BrewDownloader, source_dir: Path, patch_dir: Path) -> Path | None: + """Download the patched SRPM and generate the diff file between the source and the patched SRPM.""" + from exploit_iq_commons.utils.source_rpm_downloader import SourceRPMDownloader + + srpm_path = brew_downloader.download_patched_srpm_by_nevra(fix_info["nevra"]) + if srpm_path is None: + srpm_path = brew_downloader.download_patched_srpm(fix_info["name"], fix_info["version"], fix_info["release"],) + if srpm_path is not None: + patch_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(srpm_path, patch_dir) + SourceRPMDownloader.extract_src_rpm(srpm_path, patch_dir) + + #diff_text = _generate_tree_diff(source_dir, patch_dir) + #diff_output_path = patch_dir.parent / "locate.diff" + #diff_output_path.write_text(diff_text, encoding="utf-8") + #return diff_output_path + return None + + +# --------------------------------------------------------------------------- +# Spec/build log parsing helpers +# --------------------------------------------------------------------------- + +_SPEC_PATCH_RE = re.compile(r"^Patch(\d+)\s*:\s*(.+)$", re.IGNORECASE) + + +def _parse_spec_patch_directives( + inspector, spec_path: Path, +) -> list[tuple[int, str, str]]: + """Return ``[(index, filename, raw_line), ...]`` from ``PatchN:`` lines.""" + matches = inspector.grep_content(_SPEC_PATCH_RE.pattern, spec_path) + results: list[tuple[int, str, str]] = [] + for m in matches: + hit = _SPEC_PATCH_RE.match(m.line_content.strip()) + if hit: + results.append((int(hit.group(1)), hit.group(2).strip(), m.line_content.strip())) + return results + + +def _extract_spec_changelog(inspector, spec_path: Path) -> str | None: + """Return text after the ``%changelog`` directive, or ``None``.""" + content = inspector.read_file(spec_path) + idx = content.find("%changelog") + if idx == -1: + return None + return content[idx + len("%changelog"):] + + +def find_cve_patch_in_spec( + 
inspector, + cve_id: str, +) -> SpecPatchMatch | None: + """Find a patch file for a CVE by searching filename or spec comments. + + Search order: + 1. Patch files with CVE in filename (fast path) + 2. PatchN: directives with CVE in the contiguous comment block above + + Returns None if no match or patch file doesn't exist. + """ + cve_pattern = re.compile(re.escape(cve_id), re.IGNORECASE) + + spec_files = inspector.find_files("*.spec", recursive=False) + if not spec_files: + return None + spec_path = spec_files[0] + + patch_files = inspector.find_files("*.patch", recursive=False) + patch_filenames = {p.name for p in patch_files} + + for pf in patch_files: + if cve_pattern.search(pf.name): + for idx, fname, raw_line in _parse_spec_patch_directives(inspector, spec_path): + if fname == pf.name: + return SpecPatchMatch( + patch_index=idx, + patch_filename=pf.name, + raw_directive=raw_line, + comment_block=None, + match_source="filename", + ) + return SpecPatchMatch( + patch_index=0, + patch_filename=pf.name, + raw_directive=f"Patch: {pf.name}", + comment_block=None, + match_source="filename", + ) + + content = inspector.read_file(spec_path) + lines = content.splitlines() + + comment_block: list[str] = [] + for line in lines: + stripped = line.strip() + if stripped.startswith("#"): + comment_block.append(stripped) + elif _SPEC_PATCH_RE.match(stripped): + hit = _SPEC_PATCH_RE.match(stripped) + if hit and comment_block: + combined_comments = "\n".join(comment_block) + if cve_pattern.search(combined_comments): + patch_filename = hit.group(2).strip() + if patch_filename in patch_filenames: + return SpecPatchMatch( + patch_index=int(hit.group(1)), + patch_filename=patch_filename, + raw_directive=stripped, + comment_block=combined_comments, + match_source="comment", + ) + comment_block = [] + else: + comment_block = [] + + return None + + +_BINARY_FILE_EXTENSIONS = frozenset({ + '.uu','.uue','.iso', '.bin', '.gz', '.bz2', '.xz', '.zip', '.tar', '.tgz', '.tbz2', + '.png', 
'.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', + '.pdf', '.doc', '.docx', '.xls', '.xlsx', + '.exe', '.dll', '.so', '.dylib', '.a', '.o', '.obj', + '.pyc', '.pyo', '.class', '.jar', '.war', + '.woff', '.woff2', '.ttf', '.otf', '.eot', + '.mp3', '.mp4', '.wav', '.avi', '.mov', '.mkv', + '.db', '.sqlite', '.sqlite3', +}) + + +def _is_binary_file_path(path: str) -> bool: + """Check if file path has a binary file extension.""" + path_lower = path.lower() + return any(path_lower.endswith(ext) for ext in _BINARY_FILE_EXTENSIONS) + + +def parse_patch_file(patch_path: Path) -> ParsedPatch | None: + """Parse a downstream .patch file into structured data. + + Returns None if the file cannot be parsed. + """ + try: + diff_text = patch_path.read_text(encoding="utf-8", errors="replace") + patch_set = PatchSet.from_string(diff_text) + except Exception: + logger.warning("parse_patch_file: failed to parse %s", patch_path) + return None + + files: list[PatchFile] = [] + for patched_file in patch_set: + if patched_file.is_binary_file: + continue + if _is_binary_file_path(patched_file.target_file): + continue + + hunks: list[PatchHunk] = [] + for hunk in patched_file: + context, removed, added = [], [], [] + for line in hunk: + if line.is_context: + context.append(str(line.value).rstrip("\n")) + elif line.is_removed: + removed.append(str(line.value).rstrip("\n")) + elif line.is_added: + added.append(str(line.value).rstrip("\n")) + + hunks.append(PatchHunk( + source_start=hunk.source_start, + source_length=hunk.source_length, + target_start=hunk.target_start, + target_length=hunk.target_length, + context_lines=context, + removed_lines=removed, + added_lines=added, + )) + + files.append(PatchFile( + source_path=patched_file.source_file, + target_path=patched_file.target_file, + hunks=hunks, + is_new_file=patched_file.is_added_file, + is_deleted_file=patched_file.is_removed_file, + )) + + return ParsedPatch(patch_filename=patch_path.name, files=files) + + +# 
--------------------------------------------------------------------------- +# Downstream search pipeline +# --------------------------------------------------------------------------- +def _populate_downstream_spec_and_build( + report: DownstreamSearchReport, + inspector, + cve_pattern: str, + build_log_path: Path | None, +) -> None: + """Inspect target spec and build log for CVE fix evidence (runs with or without a patch file).""" + from vuln_analysis.tools.source_inspector import SourceInspector + + spec_files = inspector.find_files("*.spec", recursive=False) + spec_path = spec_files[0] if spec_files else None + + if not spec_path: + report.is_patch_in_spec_file = False + else: + cve_c = re.compile(cve_pattern, re.IGNORECASE) + for _idx, fname, raw_line in _parse_spec_patch_directives(inspector, spec_path): + if cve_c.search(fname): + report.spec_patch_directives_for_cve.append(raw_line) + chlog = _extract_spec_changelog(inspector, spec_path) + if chlog: + cve_in_cl = [ln for ln in chlog.splitlines() if cve_c.search(ln)] + report.spec_changelog_cve_lines = "\n".join(cve_in_cl) + grep_spec_matches = inspector.grep_content(cve_pattern, spec_path) + if grep_spec_matches: + report.is_patch_in_spec_file = True + report.spec_file_log_change = "\n".join(m.line_content for m in grep_spec_matches) + else: + report.is_patch_in_spec_file = False + + source0_matches = inspector.grep_content(r"^Source0:", spec_path) + if source0_matches: + report.spec_source0_line = source0_matches[0].line_content.strip() + version_matches = inspector.grep_content(r"^Version:", spec_path) + if version_matches: + report.spec_version_line = version_matches[0].line_content.strip() + + if build_log_path and build_log_path.exists(): + build_inspector = SourceInspector(build_log_path.parent) + build_log_matches = build_inspector.grep_content(cve_pattern, build_log_path) + if build_log_matches: + report.is_patch_applied_in_build = True + report.build_log_patch_applied = "\n".join(m.line_content for 
m in build_log_matches) + else: + report.is_patch_applied_in_build = False + else: + report.is_patch_applied_in_build = False + + +async def downstream_search_preprocss( + *, + llm, + vuln_id: str, + descriptions: list[tuple[str, str]], + source_path: Path, + build_log_path: Path | None, + tracer, +) -> DownstreamSearchReport: + """Build the downstream search pipeline.""" + from vuln_analysis.tools.source_inspector import SourceInspector + inspector = SourceInspector(source_path) + + cve_pattern = re.escape(vuln_id) + report = DownstreamSearchReport() + patch_file = None + with tracer.push_active_function("Is_patch_file_available", input_data={"vuln_id": vuln_id}) as span: + spec_match = find_cve_patch_in_spec(inspector, vuln_id) + if spec_match: + report.is_patch_file_available = True + patch_file = inspector.root / spec_match.patch_filename + report.patch_file_name = spec_match.patch_filename + if spec_match.match_source == "comment": + report.spec_patch_directives_for_cve.append(spec_match.raw_directive) + span.set_output({ + "patch_found": True, + "match_source": spec_match.match_source, + "patch_filename": spec_match.patch_filename, + }) + else: + report.is_patch_file_available = False + span.set_output({"patch_found": False}) + + with tracer.push_active_function( + "Is_patch_in_spec_file", + input_data={"patch_file_name": report.patch_file_name or "none"}, + ) as span: + _populate_downstream_spec_and_build(report, inspector, cve_pattern, build_log_path) + + if patch_file: + with tracer.push_active_function( + "Extract_patch_details", input_data={"patch_file_name": patch_file.name} + ) as span: + details = parse_patch_file(patch_file) + report.parsed_patch = details if details else None + + return report + +async def upstream_search_preprocess( + *, + vuln_id: str, + source_path: Path, + fix_info: dict, + brew_downloader: BrewDownloader, + patch_dir: Path, + target_package: TargetPackage, + tracer, + intel: list | None = None, + commit_url_candidates: 
dict[str, list[str]] | None = None, + cve_description: str | None = None, + llm: "BaseChatModel | None" = None, +) -> UpstreamSearchReport: + """Build the upstream search pipeline. + + Args: + intel: Optional list of CveIntel objects for Ubuntu patch lookup. + commit_url_candidates: Optional dict of URLs from intel references for patch fetching. + cve_description: Optional CVE description for Chromium CL disambiguation. + llm: Optional LangChain LLM for Chromium CL selection when multiple MERGED CLs exist. + """ + from vuln_analysis.tools.source_inspector import SourceInspector + inspector = SourceInspector(source_path) + report = UpstreamSearchReport() + cve_pattern = re.escape(vuln_id) + need_to_find_code = True + # Store reference package NVR from fix_info if available + if fix_info and fix_info.get("nevra"): + report.reference_package_nvr = fix_info["nevra"] + + with tracer.push_active_function("Is_upstream_fixed_by_rebase", input_data={"vuln_id": vuln_id}) as span: + spec_files = inspector.find_files("*.spec", recursive=False) + spec_path = spec_files[0] if spec_files else None + + if not spec_path: + report.is_code_fixed_by_rebase = "unknown" + else: + grep_spec_matches = inspector.grep_content(cve_pattern, spec_path) + if grep_spec_matches: + report.is_code_fixed_by_rebase = "yes" + report.spec_file_log_change = "\n".join(m.line_content for m in grep_spec_matches) + else: + report.is_code_fixed_by_rebase = "unknown" + span.set_output({ + "is_code_fixed_by_rebase": report.is_code_fixed_by_rebase, + "spec_file_log_change": report.spec_file_log_change, + }) + + if patch_dir.exists(): + shutil.rmtree(patch_dir, ignore_errors=True) + + if fix_info and brew_downloader is not None: + with tracer.push_active_function( + "download_rpm_patch", input_data={"fix_info": fix_info} + ) as span: + try: + download_patch_and_gen_diff(fix_info, brew_downloader, source_path, patch_dir) + span.set_output({"patch_dir_exists": patch_dir.exists()}) + except Exception as e: + 
logger.warning("locate: failed to download/extract patched SRPM: %s", e) + span.set_output({"error": str(e), "patch_dir_exists": False}) + + if patch_dir.exists(): + patch_inspector = SourceInspector(patch_dir) + with tracer.push_active_function("is_patch_downsteam_patch_file", input_data={"patch_dir": patch_dir}) as span: + spec_match = find_cve_patch_in_spec(patch_inspector, vuln_id) + if spec_match: + report.is_fixed_srpm_is_needed = True + report.fixed_srpm_file_name = spec_match.patch_filename + report.fixed_parsed_patch = parse_patch_file(patch_inspector.root / spec_match.patch_filename) + span.set_output({ + "is_fixed_srpm_is_needed": True, + "match_source": spec_match.match_source, + "patch_filename": spec_match.patch_filename, + }) + return report + else: + report.is_fixed_srpm_is_needed = False + span.set_output({ + "is_fixed_srpm_is_needed": report.is_fixed_srpm_is_needed}) + + # Try intel references (unified: commit URLs from GHSA/NVD/RHSA/Ubuntu) + if (not patch_dir.exists() or need_to_find_code) and not report.fixed_parsed_patch: + if commit_url_candidates: + from vuln_analysis.utils.web_patch_fetcher import WebPatchFetcher + with tracer.push_active_function( + "fetch_patch_from_intel_refs", + input_data={ + "vuln_id": vuln_id, + "candidates_count": {src: len(urls) for src, urls in commit_url_candidates.items()}, + } + ) as span: + async with aiohttp.ClientSession() as session: + fetcher = WebPatchFetcher(session=session) + result = await fetcher.fetch_from_intel_refs( + commit_url_candidates, + vuln_id, + cve_description=cve_description, + llm=llm, + ) + if result and result.parsed_patch: + report.fixed_parsed_patch = result.parsed_patch + report.fixed_srpm_file_name = result.patch_url + report.is_fixed_srpm_is_needed = True + report.osv_result = result + span.set_output({ + "source_found": result.source, + "url_type": result.url_type, + "platform": result.platform, + "patch_url": result.patch_url, + "commit_message": result.commit_message, + 
"patch_found": True, + }) + else: + span.set_output({"patch_found": False}) + + # OSV API fallback + if (not patch_dir.exists() or need_to_find_code) and not report.fixed_parsed_patch: + from vuln_analysis.utils.web_patch_fetcher import WebPatchFetcher, OSVClient + with tracer.push_active_function("fetch_patch_from_osv", input_data={"vuln_id": vuln_id}) as span: + async with aiohttp.ClientSession() as session: + fetcher = WebPatchFetcher(session=session) + client = OSVClient(session=session, patch_fetcher=fetcher) + result = await client.get_fix_patch(vuln_id, target_package.version, target_package.name) + if result and result.parsed_patch: + report.fixed_parsed_patch = result.parsed_patch + report.fixed_srpm_file_name = result.patch_url + report.is_fixed_srpm_is_needed = True + report.osv_result = result + span.set_output({ + "source_found": "osv", + "platform": result.platform, + "patch_url": result.patch_url, + "commit_message": result.commit_message, + "patch_found": True, + }) + else: + span.set_output({"patch_found": False}) + return report + diff --git a/src/vuln_analysis/functions/cve_build_agent.py b/src/vuln_analysis/functions/cve_build_agent.py new file mode 100644 index 000000000..a0c4a65d2 --- /dev/null +++ b/src/vuln_analysis/functions/cve_build_agent.py @@ -0,0 +1,784 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Level 2 Build Agent for Package Vulnerability Checker. + +Performs BuildCompilationCheck (Phase 1) and HardeningCheck (Phase 2) to +determine if vulnerable code identified by L1 is actually compiled into +the binary and whether hardening flags provide mitigation. +""" + +from pathlib import Path +from enum import StrEnum + +from aiq.builder.builder import Builder +from aiq.builder.framework_enum import LLMFrameworkEnum +from aiq.builder.function_info import FunctionInfo +from aiq.cli.register_workflow import register_function +from aiq.data_models.function import FunctionBaseConfig +from pydantic import Field + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory, trace_id +from exploit_iq_commons.data_models.checker_status import L2BuildResult + +from langgraph.graph import StateGraph, START, END +from langgraph.prebuilt import ToolNode +from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, RemoveMessage + +from nat.builder.context import Context +from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput + +from vuln_analysis.functions.react_internals import CheckerThought, CodeFindings, Observation, FORCED_FINISH_PROMPT, check_empty_output + +from vuln_analysis.functions.build_agent_graph_defs import ( + BuildAgentState, + BuildHarvestReport, + harvest_build_data, + L2CompileVerdictExtraction, + L2HardeningVerdictExtraction, +) +from vuln_analysis.utils.rpm_checker_prompts import ( + L2_CONFIG_PROMPT_TEMPLATE, + L2_CONFIG_SYS_PROMPT, + L2_CONFIG_THOUGHT_INSTRUCTIONS, + L2_CONFIG_PROMPT_SPEC_ONLY_TEMPLATE, + L2_CONFIG_SPEC_ONLY_SYS_PROMPT, + L2_CONFIG_SPEC_ONLY_THOUGHT_INSTRUCTIONS, + L2_COMPREHENSION_PROMPT, + L2_MEMORY_UPDATE_PROMPT, + L2_HARDENING_PROMPT_TEMPLATE, + L2_HARDENING_SYS_PROMPT, + L2_HARDENING_THOUGHT_INSTRUCTIONS, + L2_COMPILATION_VERDICT_PROMPT, + L2_HARDENING_VERDICT_PROMPT, + 
L2_KERNEL_CONFIG_SYS_PROMPT, + L2_KERNEL_CONFIG_PROMPT_TEMPLATE, + L2_KERNEL_THOUGHT_INSTRUCTIONS, +) +from vuln_analysis.runtime_context import ctx_state +from vuln_analysis.utils.token_utils import truncate_tool_output +import uuid +import tiktoken +logger = LoggingFactory.get_agent_logger(__name__) + + +class CVEBuildAgentConfig(FunctionBaseConfig, name="cve_build_agent"): + """ + Level 2 Build Agent. Analyzes build artifacts to determine if vulnerable + code is compiled into the binary and whether hardening flags mitigate. + + Phase 1: BuildCompilationCheck - Is vulnerable code compiled? + Phase 2: HardeningCheck - Do hardening flags mitigate the CVE? + """ + + base_checker_dir: str = Field( + default=".cache/am_cache/checker", + description="Root directory for checker-specific artifacts.", + ) + max_iterations: int = Field( + default=5, + description="The maximum number of iterations for the agent.", + ) + llm_name: str = Field(description="The LLM model to use with the L1 code agent.") + tool_names: list[str] = Field(default=[], description="The list of tools to provide to L1 code agent") + context_window_token_limit: int = Field(default=5000, description="Token limit for context window before pruning old messages.") + +def _build_tool_strategy(tool_names: list[str]) -> str: + """Generate tool usage guidance based on available tools.""" + strategies = [] + tool_names_lower = [t.lower().replace("_", " ") for t in tool_names] + + if any("grep" in t for t in tool_names_lower): + strategies.append("- Use Source Grep for exact code patterns from patch (function names, variable names, specific code)") + if any("keyword" in t or "search" in t for t in tool_names_lower): + strategies.append("- Use Code Keyword Search for broader concept searches when grep fails") + if any("read" in t for t in tool_names_lower): + strategies.append("- Use Read File to examine full context around matches") + + return "\n".join(strategies) if strategies else "Use available tools to 
class L2InvestigationPhase(StrEnum):
    """Investigation phases of the L2 build agent.

    The graph keeps the pending phases on a stack and treats the top entry as
    the active phase; CONFIGURATION is pushed last, so it runs first.
    """

    # Phase 1: is the vulnerable code compiled into the build?
    CONFIGURATION = "configuration"
    # Phase 2: do hardening flags mitigate the CVE?
    HARDENING = "hardening"
def _estimate_tokens(runtime_prompt: str, messages: list, observation: Observation | None) -> int:
    """Approximate the token count of the next thought_node LLM call.

    Closure of create_graph_build_agent. The estimate covers the runtime
    prompt, the string content of every message, and (when an observation is
    given) its memory and results entries, joined by newlines and measured
    with _count_tokens.

    Args:
        runtime_prompt: Prompt of the active investigation phase.
        messages: Conversation messages; entries without a string ``content``
            are ignored.
        observation: Latest observation, or None when there is none yet.

    Returns:
        Estimated number of tokens.
    """
    chunks = [runtime_prompt]
    chunks.extend(
        message.content
        for message in messages
        if hasattr(message, "content") and isinstance(message.content, str)
    )
    if observation is not None:
        # Memory first, then results - same order as the original concatenation.
        chunks.extend(observation.memory or [])
        chunks.extend(observation.results or [])
    return _count_tokens("\n".join(chunks))
tool_instructions=L2_CONFIG_THOUGHT_INSTRUCTIONS, + ) + else: + # Spec-only mode: no build log available + logger.info("Using spec-only prompt template (no build log)") + runtime_prompt = L2_CONFIG_PROMPT_SPEC_ONLY_TEMPLATE.format( + sys_prompt=L2_CONFIG_SPEC_ONLY_SYS_PROMPT, + vuln_id=vuln_id, + target_package=target_package.name, + vulnerability_intel=vulnerability_intel_str, + l1_preliminary_verdict=l1_preliminary_verdict, + spec_disabled_features=harvest_report.spec_disabled_features, + spec_enabled_features=harvest_report.spec_enabled_features, + built_subpackages=harvest_report.built_subpackages or "None", + excluded_subpackages=harvest_report.excluded_subpackages or "None", + kernel_config=harvest_report.kernel_config or "N/A (not a kernel package)", + tools=tools_str, + tool_instructions=L2_CONFIG_SPEC_ONLY_THOUGHT_INSTRUCTIONS, + ) + return runtime_prompt + elif current_phase == L2InvestigationPhase.HARDENING: + runtime_prompt = L2_HARDENING_PROMPT_TEMPLATE.format( + sys_prompt=L2_HARDENING_SYS_PROMPT, + vuln_id=vuln_id, + target_package=target_package.name, + cwe_id=cwe_id, + expected_hardening_table=harvest_report.expected_hardening, + tools=tools_str, + tool_instructions=L2_HARDENING_THOUGHT_INSTRUCTIONS, + ) + return runtime_prompt + else: + raise ValueError(f"Unknown investigation phase: {current_phase}") + + async def data_harvest_node(state: BuildAgentState) -> dict: + """Harvest structured data from build log, spec, and source tree before the ReAct loop.""" + logger.info("data_harvest_node: starting") + + with tracer.push_active_function("data_harvest", input_data={}) as span: + # Find spec file if available + spec_path = None + if checker_dir and checker_dir.exists(): + spec_files = list((checker_dir / "source").glob("*.spec")) + spec_path = spec_files[0] if spec_files else None + + source_path = checker_dir / "source" if checker_dir else None + harvest_report = await harvest_build_data( + build_log_path=build_log_path, + spec_path=spec_path, + 
async def thought_node(state: BuildAgentState) -> dict:
    """Run one ReAct reasoning step for the active investigation phase.

    Closure of create_graph_build_agent. Sends the phase prompt, the message
    history and (when present) the accumulated observation to the structured
    thought LLM, then converts its decision into an AIMessage: a plain answer
    when the mode is "finish", otherwise a single tool call carrying the
    requested query.

    Returns:
        State update with the new message, the raw thought, the incremented
        step counter and the configured step budget.
    """
    current_step = state.get("step", 0)
    logger.info("thought_node: starting step %d", current_step)

    phase_prompt = state.get("runtime_prompt")
    llm_input = [SystemMessage(content=phase_prompt)] + state["messages"]

    with tracer.push_active_function("thought_node", input_data={}) as span:
        observation = state.get("observation", None)
        if observation is not None:
            # Fall back to placeholder bullets so the prompt layout stays stable.
            knowledge = observation.memory or ["No prior knowledge."]
            latest = observation.results or ["No recent findings."]
            memory_context = "\n".join(f"- {entry}" for entry in knowledge)
            findings_context = "\n".join(f"- {entry}" for entry in latest)
            llm_input.append(
                SystemMessage(content=f"KNOWLEDGE:\n{memory_context}\nLATEST FINDINGS:\n{findings_context}")
            )

        decision: CheckerThought = await thought_llm.ainvoke(llm_input)

        if decision.mode != "finish":
            requested = decision.actions
            tool_call = {
                "name": requested.tool,
                "args": {"query": requested.query},
                "id": str(uuid.uuid4()),
            }
            reply = AIMessage(content=decision.thought, tool_calls=[tool_call])
        else:
            reply = AIMessage(content=decision.final_answer or "Analysis complete.")

        span.set_output({
            "thought": decision.thought,
            "mode": decision.mode,
            "actions": decision.actions,
            "final_answer": decision.final_answer,
        })

    return {
        "messages": [reply],
        "thought": decision,
        "step": current_step + 1,
        "max_steps": config.max_iterations,
    }
target_package else "unknown" + + with tracer.push_active_function("observation_node", input_data=f"tool used:{tool_used} + {tool_input_detail}") as span: + tool_output_for_llm = tool_message.content + + # Check for empty/error outputs - bypass LLM if so to prevent hallucination + empty_findings = check_empty_output(tool_output_for_llm, tool_used, tool_input_detail) + if empty_findings: + # Build-specific: empty grep for file in logs = NOT_COMPILED evidence + if tool_used == "Source Grep" and "logs:" in tool_input_detail: + file_searched = tool_input_detail.replace("logs:", "") + empty_findings = CodeFindings( + findings=[ + f"Source Grep for '{file_searched}' returned EMPTY - file not in build log", + "This indicates the file was NOT compiled in this build" + ], + tool_outcome=f"CALLED: Source Grep with {tool_input_detail} -> EMPTY (not compiled)" + ) + code_findings = empty_findings + else: + # Step 1: Comprehension - extract findings from tool output + comp_prompt = L2_COMPREHENSION_PROMPT.format( + vuln_id=vuln_id, + target_package=target_package_name, + vulnerability_intel=vulnerability_intel_str, + disabled_features=", ".join(harvest_report.disabled_features) if harvest_report.disabled_features else "None", + spec_disabled_features=", ".join(harvest_report.spec_disabled_features) if harvest_report.spec_disabled_features else "None", + enabled_features=", ".join(harvest_report.enabled_features) if harvest_report.enabled_features else "None", + spec_enabled_features=", ".join(harvest_report.spec_enabled_features) if harvest_report.spec_enabled_features else "None", + tool_used=tool_used, + tool_input=tool_input_detail, + last_thought=last_thought_text, + tool_output=truncate_tool_output(tool_output_for_llm, tool_used, max_tokens=1000), + ) + code_findings: CodeFindings = await comprehension_llm.ainvoke([SystemMessage(content=comp_prompt)]) + findings_text = "\n".join(f"- {f}" for f in code_findings.findings) + + # Step 2: Memory update - merge findings into 
cumulative memory + mem_prompt = L2_MEMORY_UPDATE_PROMPT.format( + vuln_id=vuln_id, + target_package=target_package_name, + previous_memory="\n".join(f"- {m}" for m in previous_memory) if isinstance(previous_memory, list) else previous_memory, + findings=findings_text, + tool_outcome=code_findings.tool_outcome, + ) + new_observation: Observation = await observation_llm.ainvoke([SystemMessage(content=mem_prompt)]) + + messages = state["messages"] + active_prompt = state.get("runtime_prompt") or "" + estimated = _estimate_tokens(active_prompt, messages, new_observation) + orig_estimated = estimated + prune_messages = [] + if estimated > config.context_window_token_limit: + with tracer.push_active_function("context_pruning", input_data={"estimated_tokens": estimated, "limit": config.context_window_token_limit}) as prune_span: + + for msg in messages: + prune_messages.append(RemoveMessage(id=msg.id)) + estimated -= _count_tokens(msg.content) if hasattr(msg, "content") and isinstance(msg.content, str) else 0 + if estimated <= config.context_window_token_limit: + break + logger.info( + "Context pruning: removed %d messages, estimated tokens now ~%d (limit %d)", + len(prune_messages), estimated, config.context_window_token_limit, + ) + prune_span.set_output({ + "pruning_triggered": len(prune_messages) > 0, + "messages_pruned": len(prune_messages), + "tokens_before": orig_estimated, + "tokens_after": estimated, + }) + + + span.set_output({ + "last_thought_text": last_thought_text, + "tool_output_for_llm": tool_output_for_llm[:500], + "findings": code_findings.findings, + "tool_outcome": code_findings.tool_outcome, + "new_memory": new_observation.memory, + "amount_of_orig_tokens": orig_estimated, + "amount_of_estimated_tokens": estimated, + }) + return { + "messages": prune_messages, + "observation": new_observation, + } + + async def forced_finish_node(state: BuildAgentState) -> dict: + """Force finish when max iterations reached. 
async def should_continue(state: BuildAgentState) -> str:
    """Pick the node that follows thought_node.

    Closure of create_graph_build_agent. A thought in "finish" mode goes to
    the investigation-phase node; otherwise an exhausted step budget goes to
    the forced-finish node; anything else goes to the tool node.
    """
    latest_thought = state.get("thought")
    finished = latest_thought is not None and latest_thought.mode == "finish"
    if finished:
        return INVESTIGATION_PHASE_NODE

    steps_taken = state.get("step", 0)
    step_budget = state.get("max_steps", config.max_iterations)
    return FORCED_FINISH_NODE if steps_taken >= step_budget else TOOL_NODE
async def investigation_phase_node(state: BuildAgentState) -> dict:
    """Close the phase that just finished and decide whether another one runs.

    Closure of create_graph_build_agent. Pops the active phase from
    ``investigation_stack`` and extracts a structured verdict from the final
    answer of that phase.

    After the CONFIGURATION phase the hardening phase is skipped (the stack is
    cleared so the graph ends) when:
      * the code is not compiled,
      * the package is a kernel package, or
      * neither a build log nor an existing binary RPM is available.
    Otherwise the hardening prompt is built, the conversation is reset and the
    ReAct loop restarts from step 0.

    Returns:
        State update containing ``L2CompileVerdict`` (configuration phase) or
        ``L2HardeningVerdict`` (hardening phase), plus the reset loop state
        when the hardening phase is about to start.

    Raises:
        ValueError: If no phase is left on the investigation stack.
    """
    if not investigation_stack:
        raise ValueError("Investigation stack is empty")

    final_answer = None
    thought = state.get("thought")
    if thought and thought.mode == "finish":
        final_answer = thought.final_answer

    current_phase = investigation_stack[-1]
    with tracer.push_active_function("investigation_phase_node", input_data=f"phase :{current_phase}") as span:
        investigation_stack.pop()

        if current_phase != L2InvestigationPhase.CONFIGURATION:
            # The phase that just ran was hardening: extract the hardening verdict.
            hardening_verdict: L2HardeningVerdictExtraction = await hardening_verdict_llm.ainvoke(
                [SystemMessage(content=L2_HARDENING_VERDICT_PROMPT.format(final_answer=final_answer))]
            )
            span.set_output({
                "hardening_status": hardening_verdict.hardening_status,
                "confidence": hardening_verdict.confidence,
                "reasoning": hardening_verdict.reasoning,
            })
            return {
                "L2HardeningVerdict": hardening_verdict,
            }

        verdict: L2CompileVerdictExtraction = await compilation_verdict_llm.ainvoke(
            [SystemMessage(content=L2_COMPILATION_VERDICT_PROMPT.format(final_answer=final_answer))]
        )
        verdict_summary = {
            "compilation_status": verdict.compilation_status,
            "confidence": verdict.confidence,
            "reasoning": verdict.reasoning,
        }
        span.set_output(verdict_summary)

        if verdict.compilation_status == "not_compiled":
            # Fix: the stack used to keep HARDENING here, so the graph routed
            # back to thought_node and ran a hardening phase with the stale
            # configuration prompt. Clearing the stack ends the graph instead.
            logger.info("investigation_phase_node: code not compiled - skipping hardening phase")
            investigation_stack.clear()
            span.set_output({
                **verdict_summary,
                "hardening_skipped": True,
                "hardening_skip_reason": "not_compiled",
            })
            return {"L2CompileVerdict": verdict}

        # Check if this is a kernel package - skip hardening phase
        # RHEL kernels always have hardening enabled, so checking provides low signal
        preprocess_data = state.get("harvest_report") or BuildHarvestReport()
        if preprocess_data.is_kernel:
            logger.info("investigation_phase_node: kernel package - skipping hardening phase")
            investigation_stack.clear()
            span.set_output({
                **verdict_summary,
                "hardening_skipped": True,
                "hardening_skip_reason": "kernel_package",
            })
            return {"L2CompileVerdict": verdict}

        has_binary = bool(
            artifacts
            and artifacts.binary_rpm_path
            and Path(artifacts.binary_rpm_path).exists()
        )
        if not build_log_path and not has_binary:
            logger.info(
                "investigation_phase_node: no build log or binary RPM - skipping hardening phase",
            )
            investigation_stack.clear()
            span.set_output({
                **verdict_summary,
                "hardening_skipped": True,
                "hardening_skip_reason": "no_build_or_binary_evidence",
            })
            return {"L2CompileVerdict": verdict}

        # Note: Architecture mismatch check is now handled early in L1
        # (cve_package_code_agent.py) using target_package.arch. The previous
        # L2 check using build_architecture from build logs has been removed
        # since build_architecture detection is no longer performed.

        # Normal path: proceed to hardening phase (non-kernel packages).
        # HARDENING is now on top of the stack, so this yields the hardening prompt.
        runtime_prompt = await build_runtime_prompt(preprocess_data)
        # Drop the whole configuration-phase conversation before the next phase.
        prune_messages = [RemoveMessage(id=msg.id) for msg in state["messages"]]
        span.set_output({
            "runtime_prompt": runtime_prompt,
        })
        return {
            "runtime_prompt": runtime_prompt,
            "thought": None,
            "observation": None,
            "step": 0,
            "messages": prune_messages,
            "L2CompileVerdict": verdict,
        }
@register_function(config_type=CVEBuildAgentConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def cve_build_agent(config: CVEBuildAgentConfig, builder: Builder):
    """Level 2 Build Agent entry point.

    Registers ``_arun``, which runs the L2 build-agent graph for one engine
    input and stores the resulting ``L2BuildResult`` on
    ``message.info.checker_context.l2_result``.
    """

    async def _arun(message: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput:
        """Run L2 build analysis and populate l2_result on checker_context.

        The message is returned unchanged (no ``l2_result``) when there is no
        L1 result, when no artifacts are available, or when the graph produced
        no compilation verdict.
        """
        trace_id.set(message.input.scan.id)
        tracer = Context.get()

        # Set ctx_state for tools
        from types import SimpleNamespace
        workflow_state = SimpleNamespace(original_input=message, info=message.info)
        ctx_state.set(workflow_state)

        logger.info("build_agent: starting L2 investigation")

        ctx = message.info.checker_context
        if not ctx or not ctx.l1_result:
            logger.warning("build_agent: no L1 result available, skipping L2")
            return message

        if not ctx.artifacts:
            logger.warning("build_agent: no artifacts available, skipping L2")
            return message

        has_build_log = bool(ctx.artifacts.build_log_path)
        if not has_build_log:
            logger.info("build_agent: no build log, will use spec/build system files only")

        # Build and run the graph
        build_agent_graph = await create_graph_build_agent(config, builder, message, tracer)
        initial_state: BuildAgentState = {
            "messages": [HumanMessage(content="Begin L2 build analysis")],
            "step": 0,
            "max_steps": config.max_iterations,
        }

        with tracer.push_active_function("l2_build_agent_graph", input_data=initial_state["messages"][0].content):
            # Each phase: (max_iterations * 3 react nodes) + data_harvest/forced_finish/investigation_phase
            # Two phases (CONFIG + HARDENING) when code is compiled
            steps_per_phase = (config.max_iterations * 3) + 4
            recursion_limit = steps_per_phase * 2 + 5  # buffer for edge cases
            result = await build_agent_graph.ainvoke(
                initial_state,
                config={"recursion_limit": recursion_limit},
            )

        logger.info("build_agent: L2 investigation finished")

        # Extract verdict from result
        compile_verdict = result.get("L2CompileVerdict") or None
        if compile_verdict is None:
            logger.error("build_agent: L2CompileVerdict is none should never happen")
            return message
        hardening_verdict = result.get("L2HardeningVerdict") or None
        harvest_report = result.get("harvest_report") or BuildHarvestReport()
        hardening_reason = None
        hardening_flags = []
        if compile_verdict.compilation_status == "not_compiled":
            # Vulnerable code is not part of the build: L2 overrides the L1 verdict.
            hardening_relevant = False
            l2_override_verdict = "not_vulnerable"
        elif hardening_verdict is None:
            # The graph skipped the hardening phase although the code was not
            # ruled out: either a kernel package or no build log / binary RPM
            # evidence. Fix: the non-kernel case used to be overridden to
            # "not_vulnerable" under a stale "architecture mismatch" assumption
            # (that check now happens in L1). Missing evidence is not proof of
            # safety, so L2 defers to the L1 verdict in both cases.
            hardening_relevant = False
            l2_override_verdict = None
            if not harvest_report.is_kernel:
                logger.info(
                    "build_agent: hardening phase skipped without a not_compiled verdict, deferring to L1 verdict"
                )
        else:
            hardening_relevant = True
            hardening_reason = hardening_verdict.reasoning
            hardening_flags = hardening_verdict.hardening_flags or []
            if hardening_verdict.hardening_status == "mitigated":
                l2_override_verdict = "vulnerable_mitigated"
            else:
                l2_override_verdict = None

        # Build L2 result
        evidence_sources = result.get("evidence_sources") or []
        l2_result = L2BuildResult(
            compilation_status=compile_verdict.compilation_status,
            compilation_confidence=compile_verdict.confidence,
            compilation_evidence=compile_verdict.reasoning,
            hardening_relevant=hardening_relevant,
            hardening_flags=hardening_flags,
            hardening_rationale=hardening_reason,
            l2_override_verdict=l2_override_verdict,
            evidence_sources=evidence_sources,
        )

        with tracer.push_active_function(
            "l2_agent_finish",
            input_data={"compilation_status": l2_result.compilation_status},
        ) as span:
            span.set_output({
                "compilation_status": l2_result.compilation_status,
                "compilation_confidence": l2_result.compilation_confidence,
                "l2_override_verdict": l2_result.l2_override_verdict,
            })

        # Store result on checker_context
        if message.info.checker_context is not None:
            message.info.checker_context.l2_result = l2_result
        else:
            logger.warning("build_agent: checker_context is None, cannot store l2_result")

        logger.info(
            "build_agent: L2 result - status=%s, confidence=%.2f, override=%s",
            l2_result.compilation_status,
            l2_result.compilation_confidence,
            l2_result.l2_override_verdict,
        )

        return message

    yield FunctionInfo.from_fn(
        _arun,
        description="Level 2 Build Agent: analyzes build artifacts for compilation status and hardening",
    )
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CVE Checker Report Generation Function. + +This module provides the report generation node for the L1/L2 pipeline. +It consumes L1InvestigationResult (and optionally L2BuildResult) from +checker_context and produces the final AgentMorpheusOutput. +""" + +import re +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +from aiq.builder.builder import Builder +from aiq.builder.framework_enum import LLMFrameworkEnum +from aiq.builder.function_info import FunctionInfo +from aiq.cli.register_workflow import register_function +from aiq.data_models.function import FunctionBaseConfig +from pydantic import Field + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory, trace_id +from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput +from exploit_iq_commons.data_models.checker_status import L1InvestigationResult, L2BuildResult + +from nat.builder.context import Context +from vuln_analysis.data_models.output import ( + AgentMorpheusEngineOutput, + AgentMorpheusOutput, + JustificationOutput, + OutputPayload, +) +from vuln_analysis.functions.code_agent_graph_defs import ( + CodeAgentReport, + CodeSnippet, + DownstreamSearchReport, + ParsedPatch, + UpstreamSearchReport, + generate_code_agent_report, +) + +logger = LoggingFactory.get_agent_logger(__name__) + + +_StatusLiteral = Literal["TRUE", "FALSE", "UNKNOWN"] + +_JUSTIFICATION_LABEL_TO_STATUS: dict[str, _StatusLiteral] = { + "code_not_present": "FALSE", + "code_not_reachable": "FALSE", + 
def _first_nonempty_line(text: str, *, max_len: int = 140) -> str:
    """Return the first non-blank line of *text*, stripped of surrounding whitespace.

    A line longer than *max_len* is cut to ``max_len - 1`` characters and an
    ellipsis character is appended. An empty string is returned when *text*
    has no non-blank line.
    """
    stripped_lines = map(str.strip, text.splitlines())
    candidate = next((entry for entry in stripped_lines if entry), "")
    if not candidate or len(candidate) <= max_len:
        return candidate
    return candidate[: max_len - 1] + "…"
def _derive_arch_gate_reason(
    l1_agent_answer: str | None,
    affected_bitness: str | None,
    package_arch: str,
) -> str | None:
    """Explain why an architecture gate applies, or return None when it does not.

    The L1 answer wins when it mentions a bitness or architecture mismatch; it
    is then returned stripped. Otherwise a reason is synthesized when the CVE
    affects only one bitness ("32-bit" / "64-bit") and the package
    architecture string matches only the opposite bitness.
    """
    answer_lower = (l1_agent_answer or "").lower()
    mentions_mismatch = "bitness mismatch" in answer_lower or (
        "architecture" in answer_lower and "mismatch" in answer_lower
    )
    if mentions_mismatch:
        return l1_agent_answer.strip()

    if not affected_bitness or affected_bitness == "both":
        return None

    arch = package_arch.lower()
    looks_64 = any(marker in arch for marker in ("x86_64", "amd64", "aarch64", "64"))
    looks_32 = any(marker in arch for marker in ("i686", "i386", "32"))

    if affected_bitness == "32-bit":
        if looks_64 and not looks_32:
            return f"CVE affects 32-bit only; target package is {package_arch}."
    elif affected_bitness == "64-bit":
        if looks_32 and not looks_64:
            return f"CVE affects 64-bit only; target package is {package_arch}."
    return None
+ ) + + +def _summarize_parsed_patch_lines(parsed_patch: ParsedPatch | None) -> list[str]: + if not parsed_patch: + return [] + lines: list[str] = [] + if parsed_patch.patch_filename: + lines.append(f"- **Patch file:** `{parsed_patch.patch_filename}`") + for pf in parsed_patch.files[:2]: + path = pf.target_path.lstrip("ab/") + lines.append(f"- **File in patch:** `{path}`") + for hunk in pf.hunks[:1]: + if hunk.removed_lines: + lines.append(f" - Removed: `{hunk.removed_lines[0].strip()}`") + if hunk.added_lines: + lines.append(f" - Added: `{hunk.added_lines[0].strip()}`") + return lines + + +def _format_target_fix_search_md(blocks: "ReportBlocks") -> str: + lines: list[str] = [] + lines.extend(_md_section_header(f"CVE fix search on target package ({_target_nvr(blocks)})")) + + patch_detail = f"`{blocks.patch_file_name}`" if blocks.patch_file_name else "_not found_" + spec_detail = ( + f"`{_spec_declaration_summary(blocks.spec_patch_directives)}`" + if blocks.spec_patch_directives + else ("_CVE mentioned in spec_" if blocks.is_patch_in_spec_file else "_not referenced_") + ) + build_detail = _first_nonempty_line(blocks.build_log_evidence) or "_not observed in build log_" + if build_detail and not build_detail.startswith("`") and build_detail != "_not observed in build log_": + build_detail = f"`{build_detail}`" + + lines.append( + f"- **Patch file on target:** {_md_check_result(blocks.is_patch_file_available)} — {patch_detail}" + ) + lines.append( + f"- **Referenced in spec:** {_md_check_result(blocks.is_patch_in_spec_file)} — {spec_detail}" + ) + lines.append( + f"- **Applied in build:** {_md_check_result(blocks.is_patch_applied_in_build)} — {build_detail}" + ) + lines.append("") + lines.extend(_md_note(_target_fix_search_interpretation(blocks))) + + if blocks.spec_patch_directives: + declarations = [d for d in blocks.spec_patch_directives if d.strip().startswith("Patch")] + applications = [d for d in blocks.spec_patch_directives if d.strip().startswith("%patch")] + 
def _format_external_fix_clues_md(blocks: "ReportBlocks") -> str:
    """Render the "Fix clues from advisories and reference builds" section.

    The section is a single status bullet when a patch was already found on
    the target or when the external search did not run.  Otherwise it lists
    whatever clues are available (known-fixed packages, reference NVR/SRPM,
    parsed patch summary, OSV data, rebase hints), falling back to a
    "no clue found" status bullet when none exist.
    """
    header = list(_md_section_header("Fix clues from advisories and reference builds"))

    # Short-circuit cases: one status line, nothing else.
    status = None
    if blocks.investigation_mode == "patch_available":
        status = "- **Status:** Not needed — a CVE-named patch file was found on the target package."
    elif not blocks.upstream_search_ran:
        status = (
            "- **Status:** Not evaluated — external reference search did not run "
            "(target patch workflow did not require it)."
        )
    if status is not None:
        return "\n".join([*header, status, ""]).strip()

    clues: list[str] = []

    hints = blocks.fixed_rpm_hints
    if hints:
        clues.append("- **Known-fixed packages (intel):**")
        clues.extend(f" - `{hint}`" for hint in hints[:_POLICY_MAX_RPM_LIST_ITEMS])

    ref = blocks.upstream_report
    if ref:
        if ref.reference_package_nvr:
            clues.append(f"- **Reference package NVR:** `{ref.reference_package_nvr}`")
        if ref.fixed_srpm_file_name:
            clues.append(f"- **Reference patch source:** `{ref.fixed_srpm_file_name}`")
        clues.extend(_summarize_parsed_patch_lines(ref.fixed_parsed_patch))

        osv = ref.osv_result
        if osv:
            if osv.patch_url:
                clues.append(f"- **Fetched patch URL:** {osv.patch_url}")
            if osv.fixed_commit:
                clues.append(f"- **Fixed commit:** `{osv.fixed_commit}`")
            if osv.source:
                clues.append(f"- **Patch source:** {osv.source}")

        if ref.is_code_fixed_by_rebase == "yes":
            clues.append("- **Rebase on target:** Spec/changelog suggests fix via version rebase.")
            if ref.spec_file_log_change.strip():
                excerpt = _first_nonempty_line(ref.spec_file_log_change, max_len=200)
                if excerpt:
                    clues.append(f" - `{excerpt}`")
            rationale = ref.reason_code_fixed_by_rebase.strip()
            if rationale:
                clues.append(f" - {rationale}")

    if not clues:
        clues = [
            "- **Status:** No external fix clue found — advisories and reference builds "
            "did not yield a usable patch for comparison."
        ]
    return "\n".join([*header, *clues, ""]).strip()
lines.extend(_md_section_header("Target build check")) + + l2 = blocks.l2_result + if l2 is None: + lines.append("- **Status:** Not evaluated — build agent did not run for this scan.") + lines.append("") + return "\n".join(lines).strip() + + lines.append(f"- **Compilation:** {l2.compilation_status or 'unknown'}") + if l2.compilation_evidence: + lines.append(f"- **Compilation evidence:** {l2.compilation_evidence.strip()}") + if l2.hardening_flags: + flags_str = ", ".join(l2.hardening_flags[:5]) + lines.append(f"- **Compiler hardening:** `{flags_str}`") + if l2.hardening_rationale: + lines.append(f"- **Hardening rationale:** {l2.hardening_rationale.strip()}") + if l2.l2_override_verdict: + lines.append(f"- **Build agent override:** `{l2.l2_override_verdict}`") + elif l2.hardening_relevant is False: + lines.append("- **Hardening relevant to CVE:** no") + lines.append("") + return "\n".join(lines).strip() + + +def _filter_review_snippets(snippets: list[CodeSnippet]) -> list[CodeSnippet]: + """Prefer primary source files over build-system noise when both are present.""" + primary = [ + s for s in snippets + if s.file_path.endswith((".c", ".h", ".cpp", ".cc", ".cxx")) + and "/test/" not in s.file_path + and "CMakeLists" not in s.file_path + and "Makefile" not in s.file_path + ] + return primary if primary else snippets + + +def _is_reference_tree_path(file_path: str) -> bool: + path_lower = file_path.lower() + return "_patched" in path_lower or "/patched/" in path_lower + + +@dataclass +class ReportBlocks: + """Formatted report blocks - each piece of data formatted once for UI output.""" + + # Package info + package_name: str + package_version: str + package_release: str + package_arch: str + + # CVE info + cve_id: str + cve_description: str + + # Verdict + justification_label: str + executive_summary: str + + # Evidence + evidence_chain: list[str] + affected_files: list[str] + + # Extracted facts from downstream search + patch_file_name: str + spec_patch_directives: 
def _format_snippet_bullets(self, snippets: list[CodeSnippet]) -> list[str]:
    """Render code snippets as Markdown: one heading plus a fenced excerpt each.

    For every snippet a ``###`` heading with the file path (reduced to the
    file name when longer than 72 characters) and optional line number is
    emitted, followed by a note when the path points into a reference/patched
    tree, and finally at most 12 lines of code in a fenced block whose
    language tag is inferred from the path.
    """
    lines: list[str] = []
    for snippet in snippets:
        display_path = snippet.file_path
        if len(display_path) > 72:
            display_path = Path(display_path).name
        line_part = f" (line {snippet.line_number})" if snippet.line_number else ""
        lines.append(f"### `{display_path}`{line_part}")

        if _is_reference_tree_path(snippet.file_path):
            lines.append(
                "> **Note:** Excerpt path is from a reference/patched tree, not the target SRPM source."
            )
        # NOTE(review): nesting of this blank line (inside or outside the
        # branch above) could not be confirmed; either way the rendered
        # Markdown is the same.
        lines.append("")

        lang = _infer_language_from_path(snippet.file_path)

        # Trim blank lines at both ends but keep the first line's indentation:
        # a plain str.strip() would de-indent only the first line and leave the
        # excerpt misaligned inside the fence.
        code_lines = snippet.code.rstrip().splitlines()
        while code_lines and not code_lines[0].strip():
            del code_lines[0]
        code_body = "\n".join(code_lines[:12])

        if code_body:
            lines.append(f"```{lang}")
            lines.append(code_body)
            lines.append("```")
            lines.append("")
    return lines
+ ), +} + + +def _all_patch_checks_failed(blocks: ReportBlocks) -> bool: + return not ( + blocks.is_patch_file_available + or blocks.is_patch_in_spec_file + or blocks.is_patch_applied_in_build + ) + + +def _format_justification_reason(blocks: ReportBlocks) -> str: + """One-sentence VEX label rationale (outcome voice); evidence lives in details.""" + label = blocks.justification_label + if label == "vulnerable": + if _all_patch_checks_failed(blocks): + return ( + f"Labeled vulnerable because {_NO_PATCH_EVIDENCE_PHRASE}, and the code agent " + "confirmed vulnerable patterns remain in source." + ) + return _REASON_BY_LABEL["vulnerable"] + if label == "uncertain": + if _all_patch_checks_failed(blocks): + return ( + f"Labeled uncertain because {_NO_PATCH_EVIDENCE_PHRASE}, and source evidence " + "is insufficient or conflicting." + ) + return _REASON_BY_LABEL["uncertain"] + return _REASON_BY_LABEL.get(label, _REASON_BY_LABEL["uncertain"]) + + +_BUILD_AGENT_ABSENCE_PHRASES = ( + "build agent context is not present", + "l2 context is not present", + "l2_build_context is not present", + "based solely on the code agent analysis", + "verdict is based solely on the code agent", + "because the build agent context is not present", + "build agent did not run", + "no build agent context", +) + + +def _strip_build_agent_absence_boilerplate(text: str) -> str: + """Remove LLM sentences that misstate build-agent availability.""" + parts = re.split(r"(?<=[.!?])\s+", text.strip()) + kept = [ + part + for part in parts + if part and not any(phrase in part.lower() for phrase in _BUILD_AGENT_ABSENCE_PHRASES) + ] + return " ".join(kept).strip() + + +def _infer_language_from_path(file_path: str) -> str: + """Infer programming language hint from file extension for syntax highlighting.""" + ext_map = { + ".c": "c", + ".h": "c", + ".cpp": "cpp", + ".hpp": "cpp", + ".cc": "cpp", + ".cxx": "cpp", + ".py": "python", + ".go": "go", + ".rs": "rust", + ".java": "java", + ".js": "javascript", + ".ts": 
def _build_report_blocks(
    message: AgentMorpheusEngineInput,
    code_agent_report: CodeAgentReport,
    cve_description: str,
    downstream_report: DownstreamSearchReport | None,
    upstream_report: UpstreamSearchReport | None = None,
    l1_result: L1InvestigationResult | None = None,
    l2_result: L2BuildResult | None = None,
    identify_result=None,
) -> ReportBlocks:
    """Collect everything the report needs into a single ``ReportBlocks``.

    Pulls package identity from the scan message, verdict/evidence/snippets
    from the code agent report, patch/spec/build facts from the downstream
    report (neutral defaults when it is absent), and investigation context
    from the upstream, L1, L2 and identify results.
    """

    def _snippets_of(kind: str) -> list:
        return [s for s in code_agent_report.code_snippets if s.snippet_type == kind]

    vulnerable_snippets = _snippets_of("vulnerable")
    fix_snippets = _snippets_of("fix")

    # Downstream facts: real values when a report exists, neutral defaults otherwise.
    if downstream_report:
        patch_file_name = downstream_report.patch_file_name or ""
        spec_patch_directives = downstream_report.spec_patch_directives_for_cve or []
        build_log_evidence = downstream_report.build_log_patch_applied or ""
        spec_changelog_cve_lines = downstream_report.spec_changelog_cve_lines or ""
        spec_version_line = downstream_report.spec_version_line or ""
        spec_source0_line = downstream_report.spec_source0_line or ""
        is_patch_file_available = downstream_report.is_patch_file_available
        is_patch_in_spec_file = downstream_report.is_patch_in_spec_file
        is_patch_applied_in_build = downstream_report.is_patch_applied_in_build
    else:
        patch_file_name = ""
        spec_patch_directives = []
        build_log_evidence = ""
        spec_changelog_cve_lines = ""
        spec_version_line = ""
        spec_source0_line = ""
        is_patch_file_available = False
        is_patch_in_spec_file = False
        is_patch_applied_in_build = False

    investigation_mode = _derive_investigation_mode(downstream_report, upstream_report)
    # The external search counts as "ran" unless a patch file was already
    # found on the target.
    upstream_search_ran = (
        downstream_report is None or not downstream_report.is_patch_file_available
    )

    l1_agent_answer = None
    affected_bitness = None
    if l1_result:
        l1_agent_answer = l1_result.l1_agent_answer
        if l1_result.vulnerability_intel:
            affected_bitness = l1_result.vulnerability_intel.affected_bitness

    pkg = message.input.image.target_package
    if pkg:
        package_name = pkg.name
        package_version = pkg.version or ""
        package_release = pkg.release or ""
        package_arch = pkg.arch or "x86_64"
    else:
        package_name = "unknown"
        package_version = ""
        package_release = ""
        package_arch = "x86_64"

    arch_gate_reason = _derive_arch_gate_reason(l1_agent_answer, affected_bitness, package_arch)

    if identify_result and identify_result.fixed_rpm_list:
        fixed_rpm_hints = list(identify_result.fixed_rpm_list[:_POLICY_MAX_RPM_LIST_ITEMS])
    else:
        fixed_rpm_hints = []

    vulns = message.input.scan.vulns
    cve_id = vulns[0].vuln_id if vulns else ""

    return ReportBlocks(
        package_name=package_name,
        package_version=package_version,
        package_release=package_release,
        package_arch=package_arch,
        cve_id=cve_id,
        cve_description=cve_description,
        justification_label=code_agent_report.justification_label,
        executive_summary=code_agent_report.executive_summary,
        evidence_chain=list(code_agent_report.evidence_chain),
        affected_files=list(code_agent_report.affected_files),
        patch_file_name=patch_file_name,
        spec_patch_directives=spec_patch_directives,
        build_log_evidence=build_log_evidence,
        spec_changelog_cve_lines=spec_changelog_cve_lines,
        spec_version_line=spec_version_line,
        spec_source0_line=spec_source0_line,
        is_patch_file_available=is_patch_file_available,
        is_patch_in_spec_file=is_patch_in_spec_file,
        is_patch_applied_in_build=is_patch_applied_in_build,
        vulnerable_snippets=vulnerable_snippets,
        fix_snippets=fix_snippets,
        upstream_report=upstream_report,
        l1_agent_answer=l1_agent_answer,
        investigation_mode=investigation_mode,
        arch_gate_reason=arch_gate_reason,
        l2_result=l2_result,
        fixed_rpm_hints=fixed_rpm_hints,
        upstream_search_ran=upstream_search_ran,
    )
len(pkg_states) > _POLICY_MAX_PACKAGE_STATE_ITEMS: + lines.append(f" - (+ {len(pkg_states) - _POLICY_MAX_PACKAGE_STATE_ITEMS} more)") + + return "\n".join(lines) + + +def _apply_l2_verdict( + report: CodeAgentReport, + l2_result: L2BuildResult, +) -> CodeAgentReport: + """Apply L2 Build Agent verdict overrides to the CodeAgentReport. + + .. deprecated:: + This function is deprecated. L2 results are now passed directly to + `generate_code_agent_report()` so the LLM can synthesize L1 and L2 + findings into a cohesive narrative. This function will be removed + in a future release. + """ + warnings.warn( + "_apply_l2_verdict is deprecated. L2 results are now integrated " + "directly into the LLM prompt via generate_code_agent_report().", + DeprecationWarning, + stacklevel=2, + ) + if l2_result.l2_override_verdict is None: + return report + + updated_fields = {} + + if l2_result.l2_override_verdict == "not_vulnerable": + evidence = l2_result.compilation_evidence or "" + if "Architecture mismatch" in evidence: + # Architecture-based not affected - vulnerability cannot occur on this platform + updated_fields["justification_label"] = "requires_environment" + updated_fields["executive_summary"] = ( + f"{report.executive_summary}\n\n" + f"**L2 Override:** {evidence} " + f"Vulnerability condition cannot occur on this architecture." + ) + elif l2_result.compilation_status == "not_compiled": + updated_fields["justification_label"] = "code_not_present" + updated_fields["executive_summary"] = ( + f"{report.executive_summary}\n\n" + f"**L2 Override:** Vulnerable code is NOT compiled into the binary. " + f"Evidence: {evidence or 'Build analysis confirmed exclusion.'}" + ) + else: + updated_fields["justification_label"] = "code_not_reachable" + updated_fields["executive_summary"] = ( + f"{report.executive_summary}\n\n" + f"**L2 Override:** Code determined not vulnerable by L2 analysis." 
+ ) + + elif l2_result.l2_override_verdict == "vulnerable_mitigated": + if l2_result.hardening_relevant and l2_result.hardening_flags: + updated_fields["justification_label"] = "protected_by_compiler" + flags_str = ", ".join(l2_result.hardening_flags[:5]) + updated_fields["executive_summary"] = ( + f"{report.executive_summary}\n\n" + f"**L2 Override:** Vulnerability mitigated by compiler hardening flags: {flags_str}. " + f"Rationale: {l2_result.hardening_rationale or 'Hardening flags provide protection.'}" + ) + else: + updated_fields["justification_label"] = "protected_by_mitigating_control" + updated_fields["executive_summary"] = ( + f"{report.executive_summary}\n\n" + f"**L2 Override:** Vulnerability mitigated by build-time controls." + ) + + if updated_fields: + evidence = list(report.evidence_chain) + evidence.append(f"L2 Build Agent: {l2_result.l2_override_verdict}") + if l2_result.compilation_evidence: + evidence.append(f"L2 compilation evidence: {l2_result.compilation_evidence}") + if l2_result.hardening_rationale: + evidence.append(f"L2 hardening rationale: {l2_result.hardening_rationale}") + updated_fields["evidence_chain"] = evidence + + return report.model_copy(update=updated_fields) + + return report + + +def _build_analysis( + message: AgentMorpheusEngineInput, + code_agent_report: CodeAgentReport, + intel_score: int, + cve_description: str = "", + downstream_report: DownstreamSearchReport | None = None, + upstream_report: UpstreamSearchReport | None = None, + l1_result: L1InvestigationResult | None = None, + l2_result: L2BuildResult | None = None, +) -> list[AgentMorpheusEngineOutput]: + """Build the final analysis output from the code agent report using ReportBlocks. 
@register_function(config_type=CVECheckerReportConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def cve_checker_report(config: CVECheckerReportConfig, builder: Builder):
    """Report generation function for the L1/L2 checker pipeline.

    Registers ``_arun`` as the workflow function.  ``config`` supplies the LLM
    name and the checker artifact root; ``builder`` is used to obtain the LLM.
    """

    async def _arun(message: AgentMorpheusEngineInput) -> AgentMorpheusOutput:
        """Generate the final checker report from L1 (and optionally L2) results.

        When no checker context or L1 result is available, an "uncertain"
        placeholder analysis is returned for every intel entry.  Otherwise the
        LLM report is generated, optionally written to disk as Markdown (when
        the context carries a ``source_key``), and converted into the output
        payload via ``_build_analysis``.
        """
        trace_id.set(message.input.scan.id)
        tracer = Context.get()

        logger.info("cve_checker_report: starting report generation")

        # Guard message.info itself: the fallback below already tolerates a
        # missing info object, so the context lookup must tolerate it too.
        info = message.info
        ctx = info.checker_context if info else None
        if ctx is None or ctx.l1_result is None:
            logger.error("cve_checker_report: no L1 result available")
            return AgentMorpheusOutput(
                input=message.input,
                info=message.info,
                output=OutputPayload(
                    analysis=[
                        AgentMorpheusEngineOutput(
                            vuln_id=item.vuln_id,
                            checklist=[],
                            summary="Rpm scanning investigation did not produce results.",
                            justification=JustificationOutput(
                                label="uncertain",
                                reason="Rpm scanning investigation did not produce results.",
                                status="UNKNOWN",
                            ),
                            intel_score=0,
                            cvss=None,
                            details=None,
                        )
                        for item in (info.intel if info and info.intel else [])
                    ],
                    vex=None,
                ),
            )

        l1_result = ctx.l1_result
        l2_result = ctx.l2_result

        downstream_report: DownstreamSearchReport | None = None
        upstream_report: UpstreamSearchReport | None = None

        if l1_result.downstream_report:
            downstream_report = DownstreamSearchReport.model_validate(l1_result.downstream_report)
        if l1_result.upstream_report:
            upstream_report = UpstreamSearchReport.model_validate(l1_result.upstream_report)

        vuln_id = message.input.scan.vulns[0].vuln_id
        target_package = message.input.image.target_package
        target_package_name = target_package.name if target_package else "unknown"
        intel = info.intel

        # (source, text) pairs; the first entry is later used as the CVE description.
        descriptions: list[tuple[str, str]] = []
        if intel:
            a_intel = intel[0]
            if a_intel.ghsa:
                cve_text = a_intel.ghsa.description or a_intel.ghsa.summary or ""
                if cve_text:
                    descriptions.append(("ghsa", cve_text))
            if a_intel.ubuntu and a_intel.ubuntu.description:
                descriptions.append(("ubuntu", a_intel.ubuntu.description))

        version = (target_package.version or "") if target_package else ""
        release = (target_package.release or "") if target_package else ""
        target_nvr = f"{target_package_name}-{version}-{release}" if target_package_name else ""

        policy_context = _format_policy_context_for_report(
            target_nvr=target_nvr,
            identify_result=ctx.identify_result,
            intel=intel,
        )

        llm = await builder.get_llm(llm_name=config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)

        with tracer.push_active_function("generate_code_agent_report", input_data={"vuln_id": vuln_id}):
            code_agent_report: CodeAgentReport = await generate_code_agent_report(
                llm=llm,
                vuln_id=vuln_id,
                target_package=target_package_name,
                descriptions=descriptions,
                downstream_report=downstream_report,
                upstream_report=upstream_report,
                l1_agent_answer=l1_result.l1_agent_answer,
                tracer=tracer,
                policy_context=policy_context,
                l2_result=l2_result,
            )

        source_key = ctx.source_key
        if source_key:
            report_dir = Path(config.base_checker_dir) / source_key / "report"
            report_dir.mkdir(parents=True, exist_ok=True)
            suffix = f"-{target_package_name}" if target_package_name else ""
            if version:
                suffix += f"-{version}"
            if release:
                suffix += f"-{release}"
            report_path = report_dir / f"L1_report_{vuln_id}{suffix}.md"
            # Explicit UTF-8: report text may contain non-ASCII characters and
            # must not depend on the process locale.
            report_path.write_text(
                code_agent_report.to_markdown(
                    vuln_id=vuln_id,
                    target_package=target_package_name,
                    version=version,
                    release=release,
                    downstream_report=downstream_report,
                ),
                encoding="utf-8",
            )
            logger.info("cve_checker_report: wrote report to %s", report_path)

        with tracer.push_active_function(
            "report_finish",
            input_data={
                "justification_label": code_agent_report.justification_label,
                "has_l2_override": l2_result is not None and l2_result.l2_override_verdict is not None,
            },
        ) as span:
            span.set_output({
                "executive_summary": code_agent_report.executive_summary,
                "affected_files": code_agent_report.affected_files,
            })

        # With no intel entries _build_analysis emits nothing, so the fallback
        # score is never surfaced; it only avoids indexing an empty list.
        intel_score = intel[0].intel_score if intel else 0

        cve_description = descriptions[0][1] if descriptions else ""

        return AgentMorpheusOutput(
            input=message.input,
            info=message.info,
            output=OutputPayload(
                analysis=_build_analysis(
                    message,
                    code_agent_report,
                    intel_score,
                    cve_description=cve_description,
                    downstream_report=downstream_report,
                    upstream_report=upstream_report,
                    l1_result=l1_result,
                    l2_result=l2_result,
                ),
                vex=None,
            ),
        )

    yield FunctionInfo.from_fn(
        _arun,
        description="Generate final checker report from L1/L2 investigation results",
    )
class RpmDocumentEmbedding:
    """Load extracted RPM sources from disk and split them into LangChain chunks.

    Files whose suffix is in ``LANG_PARSER_EXTENSIONS`` are loaded through
    ``GenericLoader`` with ``ExtendedLanguageParser`` and split by the
    language-aware splitter. Files whose suffix is in ``TEXT_FILE_EXTENSIONS``
    or whose name is in ``_BUILD_FILE_NAMES`` are read whole and split by a
    plain ``RecursiveCharacterTextSplitter``.
    """

    def __init__(
        self,
        source_dir: Path,
        chunk_size: int = 800,
        chunk_overlap: int = 160,
        text_chunk_size: int = 1000,
        text_chunk_overlap: int = 200,
    ):
        """Create both splitters.

        Args:
            source_dir: Root of the extracted source tree to walk.
            chunk_size: Chunk size for the language-aware (code) splitter.
            chunk_overlap: Chunk overlap for the language-aware (code) splitter.
            text_chunk_size: Chunk size for the plain-text splitter.
            text_chunk_overlap: Chunk overlap for the plain-text splitter.
        """
        self.source_dir = source_dir
        self.lang_splitter = MultiLanguageRecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap,
        )
        # The text splitter keeps its own sizes; the defaults match the values
        # that used to be hard-coded here.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=text_chunk_size, chunk_overlap=text_chunk_overlap,
        )

    def load_and_chunk_code(self) -> list[Document]:
        """Load source-code files under ``source_dir`` and return their chunks.

        Best effort: if the loader raises, a warning is logged and an empty
        list is returned so that text/build files can still be indexed.
        """
        loader = GenericLoader.from_filesystem(
            self.source_dir,
            glob="**/*",
            suffixes=list(LANG_PARSER_EXTENSIONS),
            parser=ExtendedLanguageParser(),
        )
        try:
            documents = loader.load()
        except Exception as e:
            logger.warning("LanguageParser failed on %s: %s", self.source_dir, e)
            return []
        return self.lang_splitter.split_documents(documents)

    def load_and_chunk_files(self) -> list[Document]:
        """Read text and build files under ``source_dir`` and return their chunks.

        Files are decoded as UTF-8 with undecodable bytes replaced, the same
        policy ``_index_build_files`` uses, so a file in a legacy encoding is
        still indexed instead of being skipped. Unreadable files are logged
        and skipped.
        """
        # str.endswith accepts a tuple; build it once rather than per file.
        text_suffixes = tuple(TEXT_FILE_EXTENSIONS)
        documents: list[Document] = []
        for root, _, files in os.walk(self.source_dir):
            for file in files:
                if not (file.endswith(text_suffixes) or file in _BUILD_FILE_NAMES):
                    continue
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                        content = f.read()
                except OSError as e:
                    logger.warning("Error reading %s: %s", file_path, e)
                    continue
                documents.append(Document(page_content=content, metadata={"source": file_path}))
        return self.text_splitter.split_documents(documents)

    def load_and_chunk_all(self) -> list[Document]:
        """Return code chunks followed by text/build-file chunks."""
        documents = self.load_and_chunk_code()
        documents.extend(self.load_and_chunk_files())
        return documents


class CVECheckerSegmentationConfig(FunctionBaseConfig, name="cve_checker_segmentation"):
    """
    Builds a scoped Tantivy lexical code index from extracted RPM source files.
    Reads source directories populated by source_acquisition, indexes them,
    and sets info.vdb.code_index_path for downstream checker nodes.
    """
    base_checker_dir: str = Field(
        default=".cache/am_cache/checker",
        description="Root directory for checker-specific artifacts.",
    )
    base_code_index_dir: str = Field(
        default=".cache/am_cache/code_index",
        description="Base directory for Tantivy code index storage.",
    )
    # NOTE(review): nothing visible in this module reads include_extensions; the
    # loaders use LANG_PARSER_EXTENSIONS / TEXT_FILE_EXTENSIONS instead, and
    # ".patch" appears only in this list. Confirm whether it should be wired in.
    include_extensions: list[str] = Field(
        default=[
            ".c", ".h", ".cpp", ".hpp", ".py", ".go", ".java",
            ".spec", ".patch", ".conf", ".cfg", ".sh", ".m4",
            ".ac", ".am", ".in", ".txt", ".md", ".rst",
        ],
        description="File extensions to include when building the code index.",
    )


@register_function(config_type=CVECheckerSegmentationConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def cve_checker_segmentation(config: CVECheckerSegmentationConfig, builder: Builder):
    """Register the node that builds (or reuses) the per-source code index.

    The yielded function looks up ``message.info.checker_context.source_key``,
    indexes ``<base_checker_dir>/<source_key>/source`` into the directory
    returned by ``FullTextSearch.get_index_directory`` unless that directory
    already exists, and stores the index path in ``message.info.vdb``.
    ``builder`` is part of the registration signature and is not used here.
    """
    from exploit_iq_commons.data_models.info import AgentMorpheusInfo
    from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput
    from vuln_analysis.utils.full_text_search import FullTextSearch

    async def _arun(message: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput:
        checker_context = message.info.checker_context
        source_key = checker_context.source_key if checker_context else None
        if not source_key:
            logger.info("checker_segmentation: no checker_context.source_key, skipping indexing")
            return message

        index_path = FullTextSearch.get_index_directory(config.base_code_index_dir, source_key)

        if index_path.exists():
            logger.info("checker_segmentation: cache hit on code index: %s", index_path)
        else:
            # Validate the source tree before touching the index location.
            # NOTE(review): assumes FullTextSearch may create cache_path when
            # constructed; if so, building it first would leave an empty index
            # directory that the next run reports as a cache hit.
            source_dir = Path(config.base_checker_dir) / source_key / "source"
            if not source_dir.is_dir():
                logger.warning("checker_segmentation: source dir missing: %s", source_dir)
                return message

            # perf_counter is monotonic, so the elapsed time cannot go negative
            # if the wall clock is adjusted during indexing.
            start = time.perf_counter()
            fts = FullTextSearch(cache_path=str(index_path))

            logger.info("checker_segmentation: indexing source dir %s", source_dir)
            documents = RpmDocumentEmbedding(source_dir=source_dir).load_and_chunk_all()
            fts.add_documents_from_langchain_chunks(documents)

            elapsed = time.perf_counter() - start
            logger.info("checker_segmentation: indexing completed in %.2fs at %s", elapsed, index_path)

        message.info.vdb = AgentMorpheusInfo.VdbPaths(code_index_path=str(index_path))
        return message

    yield FunctionInfo.from_fn(
        _arun,
        description="Build scoped Tantivy code index from extracted checker sources",
    )


def _index_build_files(fts, source_dir: Path) -> None:
    """Walk source_dir for extensionless build files and add them to the index.

    Every file whose name is in ``_BUILD_FILE_NAMES`` is read as UTF-8 (with
    undecodable bytes replaced) and passed to ``fts.add_documents`` as a
    ``(path, content)`` tuple. Unreadable files are logged and skipped.

    NOTE(review): no caller is visible in this module;
    ``RpmDocumentEmbedding.load_and_chunk_files`` already covers the same file
    names. Confirm whether this helper is still needed.
    """
    docs: list[tuple[str, str]] = []
    for root, _, files in os.walk(source_dir):
        for fname in files:
            if fname not in _BUILD_FILE_NAMES:
                continue
            fpath = os.path.join(root, fname)
            try:
                with open(fpath, "r", encoding="utf-8", errors="replace") as f:
                    docs.append((fpath, f.read()))
            except Exception as exc:
                logger.warning("checker_segmentation: error reading %s: %s", fpath, exc)
    if docs:
        fts.add_documents(docs)
        logger.info("checker_segmentation: indexed %d build files from %s", len(docs), source_dir)
+ """ + from vuln_analysis.data_models.output import FailureReport + + default_url = config.url + config.endpoint + failure_url = config.url + config.failure_endpoint + + if message.input.code_index_success is False: + repo_url = message.input.image.source_info[0].git_repo if message.input.image.source_info else "unknown" + report = FailureReport( + scan_id=message.input.scan.id, + error_type="processing-error", + error_message=f"Failed to clone repository {repo_url}--{message.input.failure_reason}", + ) + logger.info(f"Code index failed for scan {message.input.scan.id}, sending failure report to {failure_url}") + return OutputPayload(json=report.model_dump_json(by_alias=True), url=failure_url, skip_mlops=True) + + checker_ctx = message.info.checker_context + if checker_ctx and checker_ctx.status in CHECKER_FAILURE_ERROR_TYPES: + error_type = CHECKER_FAILURE_ERROR_TYPES[checker_ctx.status] + error_msg = PACKAGE_CHECKER_STATUS_DESCRIPTIONS.get( + checker_ctx.status, + f"Checker failed with status {checker_ctx.status}" + ) + cve_id = message.input.scan.vulns[0].vuln_id if message.input.scan.vulns else "unknown" + pkg_name = message.input.image.target_package.name if message.input.image.target_package else "unknown" + report = FailureReport( + scan_id=message.input.scan.id, + error_type=error_type, + error_message=f"{error_msg} (CVE: {cve_id}, package: {pkg_name})", + ) + logger.info( + f"Checker early exit for scan {message.input.scan.id} with status {checker_ctx.status}, " + f"sending failure report to {failure_url}" + ) + return OutputPayload(json=report.model_dump_json(by_alias=True), url=failure_url, skip_mlops=True) + + return OutputPayload(json=default_json, url=default_url, skip_mlops=False) + + @register_function(config_type=CVEHttpOutputConfig) async def output_to_http(config: CVEHttpOutputConfig, builder: Builder): # pylint: disable=unused-argument - from vuln_analysis.data_models.output import AgentMorpheusOutput, FailureReport + from 
vuln_analysis.data_models.output import AgentMorpheusOutput from vuln_analysis.utils import http_utils async def _arun(message: AgentMorpheusOutput) -> AgentMorpheusOutput: trace_id.set(message.input.scan.id) + + model_json = message.model_dump_json(by_alias=True) - url = config.url + config.endpoint + + # Save JSON for debugging - compare with local markdown reports + #from pathlib import Path + #debug_output_dir = Path(".cache/am_cache/checker_json_output") + #debug_output_dir.mkdir(parents=True, exist_ok=True) + #vuln_id = message.input.scan.vulns[0].vuln_id if message.input.scan.vulns else "unknown" + #json_file = debug_output_dir / f"{message.input.scan.id}_{vuln_id}.json" + #json_file.write_text(model_json) + #logger.info(f"Saved JSON output to {json_file}") + headers = {'Content-type': 'application/json', 'traceId': trace_id.get()} auth_header = get_auth_header(config) if auth_header is not None: headers['Authorization'] = auth_header - verify = True - if config.verify_path: - verify = config.verify_path + verify = config.verify_path if config.verify_path else True + + payload = _build_output_payload(message, config, model_json) try: - skipped_mlops = False - if message.input.code_index_success is False: - repo_url = message.input.image.source_info[0].git_repo if message.input.image.source_info else "unknown" - failure_report = FailureReport( - scan_id=message.input.scan.id, - error_type="processing-error", - error_message=f"Failed to clone repository {repo_url}--{message.input.failure_reason}", - ) - failure_url = config.url + config.failure_endpoint - logger.info(f"Code index failed for scan {message.input.scan.id}, sending failure report to {failure_url}") - model_json = failure_report.model_dump_json(by_alias=True) - url = failure_url - skipped_mlops = True - logger.info(f"Sending output to {url}") - http_utils.request_with_retry(request_kwargs={ - "url": url, "method": "POST", "data": model_json.encode('utf-8'), "headers": headers, "verify": verify - }, 
accept_status_codes=(HTTPStatus.OK, HTTPStatus.CREATED, HTTPStatus.ACCEPTED)) - if config.enable_mlops and not skipped_mlops: + logger.info(f"Sending output to {payload.url}") + http_utils.request_with_retry( + request_kwargs={ + "url": payload.url, + "method": "POST", + "data": payload.json.encode('utf-8'), + "headers": headers, + "verify": verify, + }, + accept_status_codes=(HTTPStatus.OK, HTTPStatus.CREATED, HTTPStatus.ACCEPTED), + ) + if config.enable_mlops and not payload.skip_mlops: mlops_url = None try: job = _extract_job_data(message) @@ -143,9 +213,9 @@ async def _arun(message: AgentMorpheusOutput) -> AgentMorpheusOutput: except Exception as mlops_e: logger.error('Unable to send job to MLOps API at %s. Error: %s', mlops_url, mlops_e) except Exception as e: - logger.error('Unable to send output response to %s. Error: %s', url, e) + logger.error('Unable to send output response to %s. Error: %s', payload.url, e) else: - logger.info('Successfully sent output to %s', url) + logger.info('Successfully sent output to %s', payload.url) return message diff --git a/src/vuln_analysis/functions/cve_package_code_agent.py b/src/vuln_analysis/functions/cve_package_code_agent.py new file mode 100644 index 000000000..0df0b90d0 --- /dev/null +++ b/src/vuln_analysis/functions/cve_package_code_agent.py @@ -0,0 +1,960 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from pathlib import Path +from typing import Literal + +from aiq.builder.builder import Builder +from aiq.builder.framework_enum import LLMFrameworkEnum +from aiq.builder.function_info import FunctionInfo +from aiq.cli.register_workflow import register_function +from aiq.data_models.function import FunctionBaseConfig +from pydantic import Field + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory, trace_id +from exploit_iq_commons.data_models.checker_status import L1InvestigationResult + +from langgraph.graph import StateGraph, START, END +from langgraph.prebuilt import ToolNode +from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, RemoveMessage + +from nat.builder.context import Context +from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput +from vuln_analysis.functions.code_agent_graph_defs import ( + CodeAgentState, + DownstreamSearchReport, + UpstreamSearchReport, + ParsedPatch, + downstream_search_preprocss, + upstream_search_preprocess, + extract_l1_verdict, + VulnerabilityIntel, + format_patch_data_for_intel, + get_relevant_hunks, +) +from vuln_analysis.utils.rpm_checker_prompts import ( + L1_AGENT_SYS_PROMPT_PATCH_AVAILABLE, + L1_AGENT_SYS_PROMPT_UPSTREAM_PATCH, + L1_AGENT_SYS_PROMPT_REBASE_FIX, + L1_AGENT_SYS_PROMPT_REBASE_NO_PATCH, + L1_AGENT_PROMPT_TEMPLATE, + L1_AGENT_PROMPT_TEMPLATE_NO_PATCH, + L1_AGENT_THOUGHT_INSTRUCTIONS, + L1_AGENT_THOUGHT_UPSTREAM_INSTRUCTIONS, + L1_AGENT_THOUGHT_REBASE_INSTRUCTIONS, + L1_AGENT_THOUGHT_CVE_DESC_INSTRUCTIONS, + L1_COMPREHENSION_PROMPT, + L1_MEMORY_UPDATE_PROMPT, + VULNERABILITY_INTEL_EXTRACTION_PROMPT, +) +from vuln_analysis.tools.brew_downloader import BrewDownloader, BrewDownloaderError, resolve_brew_profile +from vuln_analysis.utils.package_identifier import _extract_dist_tag + +from vuln_analysis.functions.react_internals import 
CheckerThought, CodeFindings, Observation, FORCED_FINISH_PROMPT, check_empty_output +from vuln_analysis.utils.intel_utils import extract_commit_url_candidates +from vuln_analysis.utils.vulnerability_intel_sanitizer import VulnerabilityIntelSanitizer +from vuln_analysis.utils.token_utils import truncate_tool_output +from vuln_analysis.runtime_context import ctx_state + +logger = LoggingFactory.get_agent_logger(__name__) + +import uuid +import tiktoken + +# NEVRA = Name-Epoch-Version-Release-Architecture, the standard RPM package naming format. +_RPM_NEVRA_RE = re.compile(r"^(.+?)-(?:(\d+):)?(\d\S*?)-(\S+)$") + +# Architecture mappings for early CVE applicability checks +ARCH_TO_BITNESS = { + "x86_64": "64-bit", + "amd64": "64-bit", + "i686": "32-bit", + "i386": "32-bit", + "aarch64": "64-bit", + "arm64": "64-bit", + "ppc64le": "64-bit", + "s390x": "64-bit", +} +ARCH_TO_FAMILY = { + "x86_64": "x86", + "amd64": "x86", + "i686": "x86", + "i386": "x86", + "aarch64": "arm", + "arm64": "arm", + "ppc64le": "ppc", + "s390x": "s390", +} + + +def _parse_fix_info_from_context(ctx, target_name: str, target_release: str | None = None) -> dict: + """Extract {name, version, release} from checker_context.identify_result.fixed_rpm_list. + + Handles both epoch and non-epoch NEVRAs: + - With epoch: libpq-0:13.20-1.el8_6.x86_64 + - Without epoch: libpq-13.20-1.el8_6.x86_64 + + Finds the NEVRA that matches the target package name AND EL dist tag. + Falls back to the first name match if no EL-matching entry is found. + Returns an empty dict if no match is found. 
+ """ + if not ctx or not ctx.identify_result or not ctx.identify_result.fixed_rpm_list: + return {} + + target_dist = _extract_dist_tag(target_release) if target_release else None + fallback: dict = {} + + for nevra in ctx.identify_result.fixed_rpm_list: + m = _RPM_NEVRA_RE.match(nevra) + if not m: + continue + name = m.group(1) + if name.lower() != target_name.lower(): + continue + + version = m.group(3) + release_arch = m.group(4) + release = release_arch.rsplit(".", 1)[0] if "." in release_arch else release_arch + clean_nevra = f"{name}-{version}-{release_arch}" + result = {"nevra": clean_nevra, "name": name, "version": version, "release": release} + + # Store first name match as fallback + if not fallback: + fallback = result + + # Check if EL tag matches - if so, return immediately + candidate_dist = _extract_dist_tag(release_arch) + if target_dist and candidate_dist and target_dist == candidate_dist: + return result + + # No EL-matching entry found, return fallback (first name match) + return fallback + + +def _build_tool_strategy(tool_names: list[str]) -> str: + """Generate tool usage guidance based on available tools.""" + strategies = [] + tool_names_lower = [t.lower().replace("_", " ") for t in tool_names] + + if any("grep" in t for t in tool_names_lower): + strategies.append("- Use Source Grep for exact code patterns from patch (function names, variable names, specific code)") + if any("keyword" in t or "search" in t for t in tool_names_lower): + strategies.append("- Use Code Keyword Search for broader concept searches when grep fails") + if any("read" in t for t in tool_names_lower): + strategies.append("- Use Read File to examine full context around matches") + + return "\n".join(strategies) if strategies else "Use available tools to search for vulnerable and fixed code patterns." 
+ + +# --------------------------------------------------------------------------- +# Policy context formatting for L1 reports (Feedback-2 gap coverage) +# --------------------------------------------------------------------------- + +_POLICY_MAX_RPM_LIST_ITEMS = 5 +_POLICY_RHSA_STATEMENT_CAP = 400 +_POLICY_MAX_PACKAGE_STATE_ITEMS = 8 + + +def _format_policy_context_for_l1_report( + *, + target_nvr: str, + identify_result, + intel, +) -> str: + """Build a context block for the LLM prompt covering NVR posture and RHSA excerpts. + + Returns an empty string if no meaningful context is available. + """ + lines: list[str] = [] + + # 1. Scanned target NVR + if target_nvr: + lines.append(f"**Scanned target NVR:** `{target_nvr}`") + + # 2. PackageIdentifyResult: affected/fixed lists + if identify_result: + affected = identify_result.affected_rpm_list or [] + fixed = identify_result.fixed_rpm_list or [] + + if affected: + shown = affected[:_POLICY_MAX_RPM_LIST_ITEMS] + suffix = f" (+ {len(affected) - len(shown)} more)" if len(affected) > len(shown) else "" + lines.append(f"**Affected NVRs from identify:** {', '.join(f'`{n}`' for n in shown)}{suffix}") + lines.append(f" - is_target_package_affected: `{identify_result.is_target_package_affected.value}`") + + if fixed: + shown = fixed[:_POLICY_MAX_RPM_LIST_ITEMS] + suffix = f" (+ {len(fixed) - len(shown)} more)" if len(fixed) > len(shown) else "" + lines.append(f"**Fixed NVRs from identify:** {', '.join(f'`{n}`' for n in shown)}{suffix}") + lines.append(f" - is_target_package_fixed: `{identify_result.is_target_package_fixed.value}`") + + # 3. 
RHSA excerpts (if present) + rhsa = None + if intel and len(intel) > 0: + rhsa = intel[0].rhsa + + if rhsa: + # Statement excerpt + if rhsa.statement: + stmt = rhsa.statement + if len(stmt) > _POLICY_RHSA_STATEMENT_CAP: + stmt = stmt[:_POLICY_RHSA_STATEMENT_CAP] + " …" + lines.append(f"**RHSA statement excerpt:** {stmt}") + + # Upstream fix + if rhsa.upstream_fix: + lines.append(f"**RHSA upstream_fix:** `{rhsa.upstream_fix}`") + + # Package state (compact table-like bullets) + pkg_states = rhsa.package_state or [] + if pkg_states: + lines.append("**RHSA package_state:**") + for ps in pkg_states[:_POLICY_MAX_PACKAGE_STATE_ITEMS]: + parts = [] + if ps.product_name: + parts.append(ps.product_name) + if ps.package_name: + parts.append(f"pkg={ps.package_name}") + if ps.fix_state: + parts.append(f"fix_state={ps.fix_state}") + if parts: + lines.append(f" - {' | '.join(parts)}") + if len(pkg_states) > _POLICY_MAX_PACKAGE_STATE_ITEMS: + lines.append(f" - (+ {len(pkg_states) - _POLICY_MAX_PACKAGE_STATE_ITEMS} more)") + + return "\n".join(lines) + + +class CVEPackageCodeAgentConfig(FunctionBaseConfig, name="cve_package_code_agent"): + """ + Level 1 Package Code Agent. Investigates each CVE using extracted source + code and the scoped Tantivy code index built by checker_segmentation. + + Phases: Identify -> Locate -> Verify (see HLD-standalone-checker.md §5). + """ + base_checker_dir: str = Field( + default=".cache/am_cache/checker", + description="Root directory for checker-specific artifacts.", + ) + base_code_index_dir: str = Field( + default=".cache/am_cache/code_index", + description="Base directory for Tantivy code index storage.", + ) + base_rpm_dir: str = Field( + default=".cache/am_cache/rpms", + description="Shared RPM cache directory (for BrewDownloader).", + ) + rpm_user_type: str = Field( + default="internal", + description=( + "Brew profile for reference-patch SRPM lookup: internal or external. " + "Overridden by RPM_USER_TYPE environment variable when set." 
+ ), + ) + llm_name: str = Field(description="The LLM model to use with the L1 code agent.") + tool_names: list[str] = Field(default=[], description="The list of tools to provide to L1 code agent") + max_iterations: int = Field(default=10, description="The maximum number of iterations for the agent.") + context_window_token_limit: int = Field(default=5000, description="Token limit for context window before pruning old messages.") + + +async def create_graph_code_agent(config: CVEPackageCodeAgentConfig, builder: Builder, state: AgentMorpheusEngineInput, tracer): + # Node name constants + THOUGHT_NODE = "think_node" + TOOL_NODE = "tool" + FORCED_FINISH_NODE = "forced_finish" + OBSERVATION_NODE = "observation_node" + DOWNSTREAM_SEARCH_NODE = "downstream_search" + GATHER_MORE_INFO_NODE = "gather_more_info" + L1_AGENT_NODE = "L1_agent" + + llm = await builder.get_llm(llm_name=config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN) + tools = builder.get_tools(tool_names=config.tool_names, wrapper_type=LLMFrameworkEnum.LANGCHAIN) + + thought_llm = llm.with_structured_output(CheckerThought) + comprehension_llm = llm.with_structured_output(CodeFindings) + observation_llm = llm.with_structured_output(Observation) + vulnerability_intel_llm = llm.with_structured_output(VulnerabilityIntel) + # Get tool names after filtering for dynamic guidance + enabled_tool_names = [tool.name for tool in tools] + tool_descriptions_list = [t.name + ": " + t.description for t in tools] + tools_node = ToolNode(tools, handle_tool_errors=True) + tool_strategy = _build_tool_strategy(enabled_tool_names) + tools_str = "\n".join(tool_descriptions_list) + + vuln_id = state.input.scan.vulns[0].vuln_id + ctx = state.info.checker_context + intel = state.info.intel + target_package = state.input.image.target_package + source_key = ctx.source_key + + _tiktoken_enc = tiktoken.get_encoding("cl100k_base") + + def _count_tokens(text: str) -> int: + """Count tokens using tiktoken cl100k_base encoding (~90-95% 
accurate for Llama 3.1).""" + try: + return len(_tiktoken_enc.encode(text)) + except Exception: + return len(text) // 4 + + def _estimate_tokens(runtime_prompt: str, messages: list, observation: Observation | None) -> int: + """Estimate the token count thought_node will send to the LLM.""" + parts = [runtime_prompt] + for msg in messages: + if hasattr(msg, "content") and isinstance(msg.content, str): + parts.append(msg.content) + if observation is not None: + for item in (observation.memory or []): + parts.append(item) + for item in (observation.results or []): + parts.append(item) + return _count_tokens("\n".join(parts)) + # -- Locate setup: fix info + BrewDownloader + paths ----------------------- + aIntel = intel[0] + fix_info = _parse_fix_info_from_context(ctx, target_package.name, target_package.release) + checker_dir = Path(config.base_checker_dir) / source_key + source_dir = checker_dir / "source" + patch_dir = checker_dir / "patch" + + brew_downloader = None + if fix_info: + try: + brew_downloader = BrewDownloader( + resolve_brew_profile(config.rpm_user_type), + config.base_rpm_dir, + str(checker_dir), + ) + brew_downloader.connect() + except BrewDownloaderError as e: + logger.warning("locate: BrewDownloader init failed (%s), diff path unavailable", e) + brew_downloader = None + + descriptions: list[tuple[str, str]] = [] + if aIntel.ghsa: + cve_text = aIntel.ghsa.description or aIntel.ghsa.summary or "" + if cve_text: + descriptions.append(("ghsa", cve_text)) + if aIntel.ubuntu and aIntel.ubuntu.description: + descriptions.append(("ubuntu", aIntel.ubuntu.description)) + + cve_description = "\n".join(f"[{src}] {txt}" for src, txt in descriptions) + + + async def L1_agent(state: CodeAgentState) -> dict: + logger.info("L1_agent: starting") + downstream_report = state.get("downstream_report") + upstream_report = state.get("upstream_report") + # Extract potential commit URLs from intel references for analysis + commit_url_candidates = 
extract_commit_url_candidates(aIntel) + with tracer.push_active_function("Initial_Intelligence_Gathering", input_data={"commit_url_candidates": commit_url_candidates}) as span: + + if downstream_report and downstream_report.is_patch_file_available: + parsed_patch = downstream_report.parsed_patch + patch_data = format_patch_data_for_intel(parsed_patch) + elif upstream_report and upstream_report.is_fixed_srpm_is_needed: + parsed_patch = upstream_report.fixed_parsed_patch + patch_data = format_patch_data_for_intel(parsed_patch) + else: + parsed_patch = None + patch_data = "" + + # Extract vendor mitigations from intel (RHSA, etc.) + vendor_mitigations = "" + if aIntel and aIntel.rhsa: + mitigation = getattr(aIntel.rhsa, 'mitigation', None) + if mitigation: + mit_text = mitigation.get('value', '') if isinstance(mitigation, dict) else str(mitigation) + if mit_text: + vendor_mitigations = mit_text + + vul_prompt = VULNERABILITY_INTEL_EXTRACTION_PROMPT.format( + vuln_id=vuln_id, + target_package=target_package.name, + cve_description=cve_description, + vendor_mitigations=vendor_mitigations or "No vendor mitigations available.", + patch_data=patch_data, + ) + vulnerability_intel: VulnerabilityIntel = await vulnerability_intel_llm.ainvoke( + [SystemMessage(content=vul_prompt)], + ) + vulnerability_intel = VulnerabilityIntelSanitizer(parsed_patch).apply( + vulnerability_intel + ) + # Preserve vendor mitigations in the intel object if not already extracted + if vendor_mitigations and not vulnerability_intel.known_mitigations: + vulnerability_intel.known_mitigations = vendor_mitigations + + if downstream_report: + vulnerability_intel.is_downstream_patch_available = downstream_report.is_patch_file_available + vulnerability_intel.is_patch_applied_in_build = downstream_report.is_patch_applied_in_build + vulnerability_intel.patch_file_name = downstream_report.patch_file_name or "" + + # Early architecture check - skip further investigation if CVE doesn't apply + target_arch = 
target_package.arch + target_bitness = ARCH_TO_BITNESS.get(target_arch) + target_family = ARCH_TO_FAMILY.get(target_arch) + + arch_mismatch_reason = None + if target_bitness and vulnerability_intel.affected_bitness != "both": + if target_bitness != vulnerability_intel.affected_bitness: + arch_mismatch_reason = ( + f"Bitness mismatch: CVE affects {vulnerability_intel.affected_bitness}, " + f"target is {target_bitness} ({target_arch})" + ) + + if not arch_mismatch_reason and target_family: + if vulnerability_intel.affected_architectures is not None: + if target_family not in vulnerability_intel.affected_architectures: + arch_mismatch_reason = ( + f"Architecture mismatch: CVE affects {vulnerability_intel.affected_architectures}, " + f"target is {target_family} ({target_arch})" + ) + + if arch_mismatch_reason: + logger.info( + "L1_agent: %s - %s. Skipping further investigation.", + vuln_id, arch_mismatch_reason + ) + span.set_output({ + "vulnerability_intel": vulnerability_intel.model_dump(), + "arch_mismatch_reason": arch_mismatch_reason, + }) + return { + "vulnerability_intel": vulnerability_intel, + "arch_mismatch_reason": arch_mismatch_reason, + "runtime_prompt": "", + "messages": [], + } + + span.set_output({ + "vulnerability_intel": vulnerability_intel.model_dump(), + }) + + # Use case 1: Downstream patch file is available + if downstream_report and downstream_report.is_patch_file_available: + runtime_prompt = L1_AGENT_PROMPT_TEMPLATE.format( + sys_prompt=L1_AGENT_SYS_PROMPT_PATCH_AVAILABLE, + vuln_id=vuln_id, + target_package=target_package.name, + vulnerability_intel=vulnerability_intel.format_for_prompt(), + tools=tools_str, + tool_selection_strategy=tool_strategy, + tool_instructions=L1_AGENT_THOUGHT_INSTRUCTIONS, + ) + + span.set_output({ + "mode": "patch_available", + "patch_filename": downstream_report.patch_file_name, + }) + # Use case 2: code is fixed by rebase + elif upstream_report and upstream_report.is_code_fixed_by_rebase == "yes": + + if 
upstream_report.is_fixed_srpm_is_needed and upstream_report.fixed_parsed_patch: + # Has patch context - use patch-based verification + runtime_prompt = L1_AGENT_PROMPT_TEMPLATE.format( + sys_prompt=L1_AGENT_SYS_PROMPT_REBASE_FIX, + vuln_id=vuln_id, + target_package=target_package.name, + vulnerability_intel=vulnerability_intel.format_for_prompt(), + tools=tools_str, + tool_selection_strategy=tool_strategy, + tool_instructions=L1_AGENT_THOUGHT_REBASE_INSTRUCTIONS, + ) + + span.set_output({ + "mode": "rebase_fix_verification", + "spec_log_change": upstream_report.spec_file_log_change[:200] if upstream_report.spec_file_log_change else "", + }) + else: + # No patch context - use CVE description-based verification + runtime_prompt = L1_AGENT_PROMPT_TEMPLATE_NO_PATCH.format( + sys_prompt=L1_AGENT_SYS_PROMPT_REBASE_NO_PATCH, + vuln_id=vuln_id, + target_package=target_package.name, + vulnerability_intel=vulnerability_intel.format_for_prompt(), + tools=tools_str, + tool_selection_strategy=tool_strategy, + tool_instructions=L1_AGENT_THOUGHT_CVE_DESC_INSTRUCTIONS, + ) + + span.set_output({ + "mode": "rebase_fix_cve_description", + "spec_log_change": upstream_report.spec_file_log_change[:200] if upstream_report.spec_file_log_change else "", + }) + # use case 3: in target patch was not found but patch is found in the rpm that was mention in cve that is fixed + elif upstream_report and upstream_report.fixed_parsed_patch: + runtime_prompt = L1_AGENT_PROMPT_TEMPLATE.format( + sys_prompt=L1_AGENT_SYS_PROMPT_UPSTREAM_PATCH, + vuln_id=vuln_id, + target_package=target_package.name, + vulnerability_intel=vulnerability_intel.format_for_prompt(), + tools=tools_str, + tool_selection_strategy=tool_strategy, + tool_instructions=L1_AGENT_THOUGHT_UPSTREAM_INSTRUCTIONS, + ) + + span.set_output({ + "mode": "upstream_patch_verification", + "patch_filename": upstream_report.fixed_srpm_file_name, + }) + else: + # Use case 4: Default prompt - no patch context, use VulnerabilityIntel from CVE 
async def should_continue_downstream(state: CodeAgentState) -> str:
    """Pick the node that follows the downstream search.

    Returns ``"L1_agent"`` when the downstream report exists and says a patch
    file is available; otherwise ``"gather_more_info"``.
    """
    report = state.get("downstream_report")
    patch_available = bool(report and report.is_patch_file_available)
    return "L1_agent" if patch_available else "gather_more_info"
async def thought_node(state: CodeAgentState) -> dict:
    """Generate the next thought/action using the LLM.

    The prompt consists of the runtime system prompt (or a generic fallback
    when none is stored in the state), the conversation so far and, when an
    observation exists, a KNOWLEDGE / LATEST FINDINGS block built from it.

    Returns:
        State update holding the new ``AIMessage``, the structured thought,
        the incremented step counter and the configured ``max_steps``.
    """
    step_num = state.get("step", 0)
    logger.info("thought_node: starting step %d", step_num)
    runtime_prompt = state.get("runtime_prompt") or "You are a security analyst investigating a CVE."
    messages = [SystemMessage(content=runtime_prompt)] + state["messages"]

    with tracer.push_active_function("thought_node", input_data={}) as span:
        obs = state.get("observation")
        if obs is not None:
            memory_list = obs.memory or ["No prior knowledge."]
            recent_findings = obs.results or ["No recent findings."]
            memory_context = "\n".join(f"- {m}" for m in memory_list)
            findings_context = "\n".join(f"- {f}" for f in recent_findings)
            context_block = f"KNOWLEDGE:\n{memory_context}\nLATEST FINDINGS:\n{findings_context}"
            messages.append(SystemMessage(content=context_block))

        response: CheckerThought = await thought_llm.ainvoke(messages)

        if response.mode == "act" and response.actions is None:
            # CheckerThought allows mode="act" with actions=None. There is no
            # tool call to build in that case, so end the loop cleanly rather
            # than fail with AttributeError on `response.actions.tool`.
            logger.warning(
                "thought_node: step %d returned mode='act' without actions; finishing",
                step_num,
            )
            response = CheckerThought(
                thought=response.thought,
                mode="finish",
                actions=None,
                final_answer=response.final_answer,
            )

        if response.mode == "finish":
            # final_answer is optional in the schema; message content must not
            # be None, so fall back to the thought text.
            ai_message = AIMessage(content=response.final_answer or response.thought)
        else:
            tool_call = {
                "name": response.actions.tool,
                "args": {"query": response.actions.query},
                "id": str(uuid.uuid4()),
            }
            ai_message = AIMessage(content=response.thought, tool_calls=[tool_call])

        span.set_output({
            "thought": response.thought,
            "mode": response.mode,
            "actions": response.actions,
            "final_answer": response.final_answer,
        })

    return {
        "messages": [ai_message],
        "thought": response,
        "step": step_num + 1,
        "max_steps": config.max_iterations,
    }
async def observation_node(state: CodeAgentState) -> dict:
    """Process tool output: comprehension -> memory update with VulnerabilityIntel context.

    Steps:
      1. Short-circuit empty/error tool output via ``check_empty_output`` so
         the LLM is not asked to interpret it.
      2. Otherwise summarise the raw output with the comprehension LLM,
         passing the relevant patch hunks when the tool was "Source Grep".
      3. Merge the findings into the cumulative memory with the observation LLM.
      4. If the estimated context exceeds the configured limit, schedule the
         oldest messages for removal until it fits.

    Returns:
        State update with the ``RemoveMessage`` markers (possibly empty) and
        the new observation.
    """
    logger.info("observation_node: starting")
    tool_message = state["messages"][-1]
    last_thought = state.get("thought")
    if not last_thought:
        return {
            "messages": [AIMessage(content="No thought found")],
        }

    last_thought_text = last_thought.thought
    tool_used = last_thought.actions.tool
    tool_input_detail = last_thought.actions.query

    prior_observation = state.get("observation")
    previous_memory = prior_observation.memory if prior_observation else ["No data gathered yet."]

    vulnerability_intel = state.get("vulnerability_intel")
    intel_formatted = vulnerability_intel.format_for_prompt() if vulnerability_intel else "No intel available"
    target_package_name = target_package.name if target_package else "unknown"

    with tracer.push_active_function(
        "observation node",
        input_data=f"tool used:{tool_used} + {tool_input_detail}",
    ) as span:
        tool_output_for_llm = tool_message.content

        # Check for empty/error outputs - bypass LLM if so to prevent hallucination
        empty_findings = check_empty_output(tool_output_for_llm, tool_used, tool_input_detail)
        if empty_findings:
            code_findings = empty_findings
        else:
            # Get parsed_patch from state for raw diff context.
            # Reports may be Pydantic models or dicts depending on state serialization.
            downstream_report = state.get("downstream_report")
            upstream_report = state.get("upstream_report")
            parsed_patch = None

            if downstream_report:
                if isinstance(downstream_report, dict):
                    parsed_patch = downstream_report.get('parsed_patch')
                else:
                    parsed_patch = getattr(downstream_report, 'parsed_patch', None)

            if not parsed_patch and upstream_report:
                if isinstance(upstream_report, dict):
                    parsed_patch = upstream_report.get('fixed_parsed_patch')
                else:
                    parsed_patch = getattr(upstream_report, 'fixed_parsed_patch', None)

            # If parsed_patch is a dict, convert it to ParsedPatch model
            if parsed_patch and isinstance(parsed_patch, dict):
                try:
                    parsed_patch = ParsedPatch(**parsed_patch)
                except Exception as e:
                    logger.warning("Failed to parse parsed_patch dict: %s", e)
                    parsed_patch = None

            logger.debug("observation_node: parsed_patch=%s, downstream=%s, upstream=%s",
                         parsed_patch is not None,
                         downstream_report is not None,
                         upstream_report is not None)

            # Extract relevant hunks based on grep target file
            raw_patch_diff = ""
            if tool_used == "Source Grep" and parsed_patch:
                raw_patch_diff = get_relevant_hunks(parsed_patch, tool_input_detail)

            # Step 1: Comprehension - extract key findings from raw tool output
            comp_prompt = L1_COMPREHENSION_PROMPT.format(
                vuln_id=vuln_id,
                target_package=target_package_name,
                vulnerability_intel=intel_formatted,
                raw_patch_diff=raw_patch_diff,
                tool_used=tool_used,
                tool_input=tool_input_detail,
                last_thought=last_thought_text,
                tool_output=truncate_tool_output(tool_output_for_llm, tool_used, max_tokens=1000),
            )
            code_findings: CodeFindings = await comprehension_llm.ainvoke([SystemMessage(content=comp_prompt)])

        findings_text = "\n".join(f"- {f}" for f in code_findings.findings)

        # Step 2: Memory update - merge findings into cumulative memory
        if isinstance(previous_memory, list):
            previous_memory_text = "\n".join(f"- {m}" for m in previous_memory)
        else:
            previous_memory_text = previous_memory
        mem_prompt = L1_MEMORY_UPDATE_PROMPT.format(
            vuln_id=vuln_id,
            target_package=target_package_name,
            previous_memory=previous_memory_text,
            findings=findings_text,
            tool_outcome=code_findings.tool_outcome,
        )
        new_observation: Observation = await observation_llm.ainvoke([SystemMessage(content=mem_prompt)])

        messages = state["messages"]
        active_prompt = state.get("runtime_prompt")
        estimated = _estimate_tokens(active_prompt, messages, new_observation)
        orig_estimated = estimated
        prune_messages = []
        token_limit = config.context_window_token_limit

        if estimated > token_limit:
            # Drop messages oldest-first until the estimate fits.
            # NOTE(review): this can remove an AI tool-call message while
            # keeping its tool reply - confirm the model tolerates that.
            for msg in messages:
                if not msg.id:
                    # A message without an id cannot be addressed by
                    # RemoveMessage, so it frees nothing.
                    continue
                prune_messages.append(RemoveMessage(id=msg.id))
                content = getattr(msg, "content", None)
                if isinstance(content, str):
                    estimated -= _count_tokens(content)
                if estimated <= token_limit:
                    break
            logger.info(
                "Context pruning: removed %d messages, estimated tokens now ~%d (limit %d)",
                len(prune_messages), estimated, token_limit,
            )

        span.set_output({
            "last_thought_text": last_thought_text,
            "tool_output_for_llm": tool_output_for_llm[:500],
            "findings": code_findings.findings,
            "tool_outcome": code_findings.tool_outcome,
            "new_memory": new_observation.memory,
            "amount_of_orig_tokens": orig_estimated,
            "amount_of_estimated_tokens": estimated,
        })

    return {
        "messages": prune_messages,
        "observation": new_observation,
    }
@register_function(config_type=CVEPackageCodeAgentConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def cve_package_code_agent(config: CVEPackageCodeAgentConfig, builder: Builder):
    """Register the Level 1 package code agent as a workflow function.

    The registered ``_arun`` builds and runs the L1 investigation graph for
    the incoming message, converts the graph result into an
    ``L1InvestigationResult`` and stores it on ``message.info.checker_context``.
    """

    def _report_as_dict(report):
        """Return a report as a plain dict; it may already be one after state serialization."""
        if report is None:
            return None
        if isinstance(report, dict):
            return report
        return report.model_dump()

    def _store_l1_result(message, l1_result) -> None:
        """Attach the L1 result to the checker context, warning when there is none."""
        if message.info.checker_context is not None:
            message.info.checker_context.l1_result = l1_result
        else:
            logger.warning("package_code_agent: checker_context is None, cannot store l1_result")

    async def _arun(message: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput:
        """Run L1 investigation and return intermediate result for routing to L2 or report generation."""
        trace_id.set(message.input.scan.id)
        tracer = Context.get()

        # Set ctx_state so tools (e.g., Source Grep, Lexical Search) can access checker_context
        from types import SimpleNamespace
        workflow_state = SimpleNamespace(original_input=message, info=message.info)
        ctx_state.set(workflow_state)

        logger.info("package_code_agent: starting L1 investigation")

        l1_agent_graph = await create_graph_code_agent(config, builder, message, tracer)
        initial_state: CodeAgentState = {
            "messages": [HumanMessage(content="Begin L1 CVE investigation")],
            "step": 0,
            "max_steps": config.max_iterations,
        }

        with tracer.push_active_function("l1_agent_graph", input_data=initial_state["messages"][0].content):
            result = await l1_agent_graph.ainvoke(
                initial_state,
                config={"recursion_limit": config.max_iterations * 4},
            )

        logger.info("package_code_agent: L1 investigation finished")

        # Check for early architecture mismatch exit
        arch_mismatch_reason = result.get("arch_mismatch_reason")
        if arch_mismatch_reason:
            logger.info(
                "package_code_agent: Architecture mismatch detected - %s",
                arch_mismatch_reason
            )
            l1_result = L1InvestigationResult(
                downstream_report=None,
                upstream_report=None,
                l1_agent_answer=arch_mismatch_reason,
                vulnerability_intel=result.get("vulnerability_intel"),
                preliminary_verdict="not_present",
                confidence=1.0,
            )
            # Same storage rule as the normal path: warn rather than drop silently.
            _store_l1_result(message, l1_result)
            return message

        final_answer = None
        thought = result.get("thought")
        if thought and thought.mode == "finish":
            final_answer = thought.final_answer

        vuln_id = message.input.scan.vulns[0].vuln_id
        target_package = message.input.image.target_package
        target_package_name = target_package.name if target_package else "unknown"

        llm = await builder.get_llm(llm_name=config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
        verdict_extraction = await extract_l1_verdict(
            llm=llm,
            vuln_id=vuln_id,
            target_package=target_package_name,
            final_answer=final_answer or "No final answer produced.",
            tracer=tracer,
        )
        preliminary_verdict = verdict_extraction.preliminary_verdict
        confidence = verdict_extraction.confidence

        downstream_report: DownstreamSearchReport | None = result.get("downstream_report")
        upstream_report: UpstreamSearchReport | None = result.get("upstream_report")
        vulnerability_intel: VulnerabilityIntel | None = result.get("vulnerability_intel")

        l1_result = L1InvestigationResult(
            downstream_report=_report_as_dict(downstream_report),
            upstream_report=_report_as_dict(upstream_report),
            l1_agent_answer=final_answer,
            vulnerability_intel=vulnerability_intel,
            preliminary_verdict=preliminary_verdict,
            confidence=confidence,
        )

        with tracer.push_active_function(
            "l1_agent_finish",
            input_data={"preliminary_verdict": preliminary_verdict},
        ) as span:
            span.set_output({
                "l1_agent_answer": final_answer[:500] if final_answer else None,
                "vulnerability_intel": vulnerability_intel,
                "confidence": l1_result.confidence,
            })

        _store_l1_result(message, l1_result)
        logger.info(
            "package_code_agent: L1 result - verdict=%s, confidence=%.2f",
            preliminary_verdict,
            l1_result.confidence,
        )
        return message

    yield FunctionInfo.from_fn(
        _arun,
        description="Level 1 Package Code Agent: investigates CVEs using extracted source and Tantivy code index",
    )
vuln_analysis.functions.cve_calculate_intel_score import CVECalculateIntelScoreConfig + +logger = LoggingFactory.get_agent_logger(__name__) + + +def _artifacts_from_cache( + target_dir: Path, + source_dir: Path, + log_file: Path, + metadata_file: Path, +) -> AcquiredArtifacts: + """Populate AcquiredArtifacts from an on-disk checker cache directory.""" + artifacts = AcquiredArtifacts() + artifacts.srpm_path = source_dir + if log_file.exists(): + artifacts.build_log_path = log_file + binaries_dir = target_dir / "binaries" + if binaries_dir.exists(): + artifacts.binary_rpm_path = binaries_dir + if metadata_file.exists(): + try: + metadata = json.loads(metadata_file.read_text(encoding="utf-8")) + artifacts.source_url = metadata.get("source_url") + except (json.JSONDecodeError, OSError) as e: + logger.warning("Failed to read metadata.json: %s", e) + return artifacts + + +def _update_metadata_arch(metadata_file: Path, arch: str) -> None: + """Update metadata.json to add a new arch entry to arch_artifacts.""" + if not metadata_file.exists(): + return + + try: + metadata = json.loads(metadata_file.read_text(encoding="utf-8")) + if "arch_artifacts" not in metadata: + metadata["arch_artifacts"] = {} + + metadata["arch_artifacts"][arch] = { + "build_log_downloaded_at": datetime.now(timezone.utc).isoformat(), + } + + metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8") + logger.info("Updated metadata with arch=%s", arch) + except (json.JSONDecodeError, OSError) as e: + logger.warning("Failed to update metadata.json with arch=%s: %s", arch, e) + + +class CVESourceAcquisitionConfig(FunctionBaseConfig, name="cve_source_acquisition"): + """ + Downloads source containers, extracts layers, and locates package sources + by purl and ecosystem. Populates the pipeline state with source paths for + downstream checker segmentation and investigation nodes. 
@register_function(config_type=CVESourceAcquisitionConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def cve_source_acquisition(config: CVESourceAcquisitionConfig, builder: Builder):
    """Register the source-acquisition step of the checker pipeline.

    The registered ``_arun`` identifies the target package for the first
    vulnerability in the scan, then fills ``message.info.checker_context``
    with artifacts taken from the on-disk checker cache or downloaded through
    ``BrewDownloader``.
    """
    from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput

    async def _arun(message: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput:
        logger.info("source_acquisition: starting source code acquisition")

        rpm_manager = RPMDependencyManager.get_instance()
        rpm_manager.set_rpm_cache_dir(config.base_rpm_dir)
        message.info.checker_context = PackageCheckerContext()
        intel_list = message.info.intel or []
        vulns = message.input.scan.vulns

        intel_by_vuln = {i.vuln_id: i for i in intel_list}
        target_package = message.input.image.target_package

        identifier = PackageIdentifier(
            target_package=target_package,
        )

        status = PackageCheckerStatus.OK

        intel_score_config = builder.get_function_config("cve_calculate_intel_score")
        assert isinstance(intel_score_config, CVECalculateIntelScoreConfig)

        # Only the first vulnerability of the scan is considered.
        if vulns:
            vuln_info = vulns[0]
            intel = intel_by_vuln.get(vuln_info.vuln_id)

            skip_low_score = False
            if intel_score_config.generate_intel_score and intel:
                score = intel.get_intel_score()
                if score < intel_score_config.intel_low_score and not intel_score_config.insist_analysis:
                    logger.info("Intel score %d below threshold %d for %s - skipping",
                                score, intel_score_config.intel_low_score, vuln_info.vuln_id)
                    status = PackageCheckerStatus.PKG_INTEL_LOW_SCORE
                    skip_low_score = True

            if not skip_low_score:
                status, result = identifier.identify(intel)
                message.info.checker_context.identify_result = result

        message.info.checker_context.status = status
        if status != PackageCheckerStatus.OK:
            return message

        # The NVR string is both the cache-key seed and the metadata "nvr" value.
        nvr = f"{target_package.name}-{target_package.version}-{target_package.release}"
        identifier_key = hashlib.sha256(nvr.encode()).hexdigest()[:16]
        message.info.checker_context.source_key = identifier_key

        target_dir = Path(config.base_checker_dir) / identifier_key
        arch = target_package.arch

        source_dir = target_dir / "source"
        log_file = target_dir / "logs" / arch / "build.log"
        metadata_file = target_dir / "metadata.json"

        source_exists = source_dir.exists() and any(source_dir.iterdir())
        log_exists = log_file.exists()

        brew_profile = resolve_brew_profile(config.rpm_user_type)
        brew_downloader = BrewDownloader(brew_profile, config.base_rpm_dir, str(target_dir))

        if source_exists and (log_exists or not brew_downloader.auto_fetch_build_log):
            logger.info("Full cache hit for %s (arch=%s): %s", identifier_key, arch, target_dir)
            message.info.checker_context.artifacts = _artifacts_from_cache(
                target_dir, source_dir, log_file, metadata_file,
            )
            return message

        if source_exists and not log_exists and brew_downloader.auto_fetch_build_log:
            logger.info("Partial cache hit for %s - downloading log for arch=%s", identifier_key, arch)
            artifacts = _artifacts_from_cache(target_dir, source_dir, log_file, metadata_file)

            try:
                brew_downloader.connect()
                build = brew_downloader.search_build(
                    target_package.name, target_package.version, target_package.release,
                )
                if build:
                    artifacts.build_log_path = brew_downloader.try_download_build_log(build, arch)
                    if artifacts.build_log_path:
                        _update_metadata_arch(metadata_file, arch)
                        logger.info("Downloaded build log for arch=%s", arch)
            except BrewDownloaderError as e:
                logger.warning("Failed to fetch build for build log arch=%s: %s", arch, e)

            message.info.checker_context.artifacts = artifacts
            return message

        # Full cache miss - download everything
        logger.info("Full cache miss for %s (arch=%s)", identifier_key, arch)
        target_dir.mkdir(parents=True, exist_ok=True)
        try:
            brew_downloader.connect()
            artifacts = brew_downloader.download_target_artifacts(
                target_package.name, target_package.version, target_package.release, arch,
            )
        except BrewDownloaderError as e:
            logger.error("Failed to download patched SRPM: %s", e)
            message.info.checker_context.status = PackageCheckerStatus.ERROR_FAILED_TO_DOWNLOAD_SRPM
            return message

        message.info.checker_context.artifacts = artifacts

        arch_entry: dict[str, str] = {}
        if artifacts.build_log_path:
            arch_entry["build_log_downloaded_at"] = datetime.now(timezone.utc).isoformat()
        metadata = {
            "source_url": artifacts.source_url,
            "nvr": nvr,
            "downloaded_at": datetime.now(timezone.utc).isoformat(),
            "arch_artifacts": {arch: arch_entry} if arch_entry else {},
        }
        try:
            metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
            logger.info("Wrote metadata to %s", metadata_file)
        except OSError as e:
            # Cache readers treat metadata.json as optional, so a failed write
            # must not discard the artifacts that were just downloaded.
            logger.warning("Failed to write metadata to %s: %s", metadata_file, e)

        return message

    yield FunctionInfo.from_fn(
        _arun,
        input_schema=AgentMorpheusEngineInput,
        description="Downloads source containers and locates package sources by purl and ecosystem.",
    )
def check_empty_output(
    tool_output: str | list,
    tool_used: str,
    tool_input: str,
) -> CodeFindings | None:
    """Check if tool output is empty or an error, returning factual CodeFindings if so.

    This bypasses LLM comprehension for empty/error outputs to prevent hallucination.

    A string counts as an error only when it *starts* with an error marker
    (ignoring leading whitespace). Matching the marker anywhere would
    misclassify genuine search hits whose matched source text contains
    e.g. ``"error:"``, and drop real evidence.

    Args:
        tool_output: Raw tool result (string or list).
        tool_used: Name of the tool that produced the output.
        tool_input: Query that was passed to the tool.

    Returns:
        CodeFindings with factual empty/error message, or None if output has content.
    """
    error_prefixes = ("Error:", "error:", "Failed:", "Exception:", "Traceback")

    if isinstance(tool_output, str):
        text = tool_output.strip()
        is_empty = text in ("", "[]")
        is_error = text.startswith(error_prefixes)
    else:
        # Lists (and any other falsy value such as None) are empty when falsy.
        is_empty = not tool_output
        is_error = False

    if is_empty:
        return CodeFindings(
            findings=[f"{tool_used} for '{tool_input}' returned empty - no matches found"],
            tool_outcome=f"CALLED: {tool_used} with {tool_input} -> EMPTY (no results)"
        )

    if is_error:
        return CodeFindings(
            findings=[
                f"FAILED: {tool_used} [{tool_input}] - tool error",
                f"Details: {str(tool_output)[:150]}"
            ],
            tool_outcome=f"FAILED: {tool_used} with {tool_input} -> ERROR"
        )

    return None
@@ -90,6 +159,25 @@ class Classification(BaseModel): ) +class L1VerdictExtraction(BaseModel): + """Lightweight structured output for extracting verdict from L1 final answer.""" + preliminary_verdict: str = Field( + description=( + "Classify the L1 agent's conclusion: " + "'protected' if fix/patch applied or code mitigated, " + "'not_present' if vulnerable code not found in this version, " + "'vulnerable' if vulnerable code confirmed present, " + "'uncertain' if evidence is insufficient or conflicting" + ) + ) + confidence: float = Field( + description="Confidence in the verdict (0.0 to 1.0) based on evidence strength in the answer" + ) + reasoning: str = Field( + description="Brief explanation of why this verdict was chosen" + ) + + class PackageSelection(BaseModel): """Structured output for selecting the most relevant package from multiple candidates.""" selected_package: str = Field( diff --git a/src/vuln_analysis/register.py b/src/vuln_analysis/register.py index fe3b4b755..03ca69448 100644 --- a/src/vuln_analysis/register.py +++ b/src/vuln_analysis/register.py @@ -23,9 +23,12 @@ from aiq.data_models.function import FunctionBaseConfig from pydantic import Field +from exploit_iq_commons.data_models.common import PipelineMode +from exploit_iq_commons.data_models.checker_status import PackageCheckerStatus, PACKAGE_CHECKER_STATUS_DESCRIPTIONS from exploit_iq_commons.data_models.input import AgentMorpheusEngineInput from exploit_iq_commons.data_models.input import AgentMorpheusInput -from vuln_analysis.data_models.output import AgentMorpheusOutput +from exploit_iq_commons.data_models.info import AgentMorpheusInfo +from vuln_analysis.data_models.output import AgentMorpheusEngineOutput, AgentMorpheusOutput, JustificationOutput, OutputPayload from vuln_analysis.data_models.state import AgentMorpheusEngineState # pylint: disable=unused-import from vuln_analysis.functions import cve_agent @@ -36,8 +39,13 @@ from vuln_analysis.functions import cve_generate_vdbs from 
vuln_analysis.functions import cve_http_output from vuln_analysis.functions import cve_justify +from vuln_analysis.functions import cve_package_code_agent +from vuln_analysis.functions import cve_checker_segmentation +from vuln_analysis.functions import cve_source_acquisition from vuln_analysis.functions import cve_process_sbom from vuln_analysis.functions import cve_summarize +from vuln_analysis.functions import cve_checker_report +from vuln_analysis.functions import cve_build_agent from vuln_analysis.functions import cve_generate_cvss from vuln_analysis.functions import cve_generate_vex from vuln_analysis.functions import health_endpoint @@ -49,6 +57,7 @@ from vuln_analysis.tools import serp from vuln_analysis.tools import configuration_scanner from vuln_analysis.tools import import_usage_analyzer +from vuln_analysis.tools import source_grep from vuln_analysis.utils.error_handling_decorator import catch_pipeline_errors_async # pylint: enable=unused-import from vuln_analysis.utils.llm_engine_utils import postprocess_engine_output, finalize_preprocess_engine_input @@ -77,6 +86,26 @@ class CVEAgentWorkflowConfig(FunctionBaseConfig, name="cve_agent"): description="Function to output workflow results " "(e.g. cve_file_output, cve_http_output). 
" " If None, only prints to console") + cve_source_acquisition_name: str | None = Field( + default=None, + description="Function name for source acquisition (downloads source containers, locates package sources)", + ) + cve_checker_segmentation_name: str | None = Field( + default=None, + description="Function name for scoped code indexing of extracted checker sources (Tantivy only)", + ) + cve_package_code_agent_name: str | None = Field( + default=None, + description="Function name for the Level 1 Package Code Agent (source-level CVE investigation)", + ) + cve_checker_report_name: str | None = Field( + default=None, + description="Function name for the checker report generation (L1/L2 report synthesis)", + ) + cve_build_agent_name: str | None = Field( + default=None, + description="Function name for the Level 2 Build Agent (build compilation and hardening check)", + ) description: str = Field(default="Vulnerability analysis for container security workflow", description="Workflow function description") @@ -101,6 +130,26 @@ async def cve_agent_workflow(config: CVEAgentWorkflowConfig, builder: Builder): cve_generate_vex_fn = builder.get_function(name=config.cve_generate_vex_name) cve_generate_cvss_fn = builder.get_function(name=config.cve_generate_cvss_name) cve_output_fn = builder.get_function(name=config.cve_output_config_name) if config.cve_output_config_name else None + cve_source_acquisition_fn = ( + builder.get_function(name=config.cve_source_acquisition_name) + if config.cve_source_acquisition_name else None + ) + cve_checker_segmentation_fn = ( + builder.get_function(name=config.cve_checker_segmentation_name) + if config.cve_checker_segmentation_name else None + ) + cve_package_code_agent_fn = ( + builder.get_function(name=config.cve_package_code_agent_name) + if config.cve_package_code_agent_name else None + ) + cve_checker_report_fn = ( + builder.get_function(name=config.cve_checker_report_name) + if config.cve_checker_report_name else None + ) + 
cve_build_agent_fn = ( + builder.get_function(name=config.cve_build_agent_name) + if config.cve_build_agent_name else None + ) # Define langgraph node functions @catch_pipeline_errors_async @@ -185,7 +234,24 @@ async def output_results_node(state: AgentMorpheusOutput) -> AgentMorpheusOutput """Outputs results using configured output function""" return await cve_output_fn.ainvoke(state.model_dump()) if cve_output_fn else state - + + # --- Package checker path nodes --- + + @catch_pipeline_errors_async + async def checker_init_state_node(state: AgentMorpheusInput) -> AgentMorpheusEngineInput: + """Bridges AgentMorpheusInput -> AgentMorpheusEngineInput with empty info (skips VDB generation).""" + return AgentMorpheusEngineInput(input=state, info=AgentMorpheusInfo()) + + @catch_pipeline_errors_async + async def checker_fetch_intel_node(state: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput: + """Fetch intel for CVE input (package checker path). Reuses the same fetch_intel function.""" + return await cve_fetch_intel_fn.ainvoke(state.model_dump()) + + @catch_pipeline_errors_async + async def checker_calculate_intel_score_node(state: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput: + """Calculate intel score for CVE input (package checker path).""" + return await cve_calculate_intel_score_fn.ainvoke(state.model_dump()) + async def check_vdbs_success(state: AgentMorpheusInput) -> str: """Checks if the VDBs were successfully generated""" if state.code_index_success: @@ -198,7 +264,138 @@ async def failure_node(state: AgentMorpheusInput) -> AgentMorpheusOutput: from exploit_iq_commons.data_models.info import AgentMorpheusInfo from vuln_analysis.data_models.output import OutputPayload return AgentMorpheusOutput(input=state, info=AgentMorpheusInfo(), output=OutputPayload(analysis=[], vex=None)) - # define langgraph + + + + @catch_pipeline_errors_async + async def source_acquisition_node(state: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput: + """Acquires 
def route_after_l1(state: AgentMorpheusEngineInput) -> str:
    """Pick the node that follows the Level 1 Package Code Agent.

    Routes to the L2 Build Agent when L1 concluded ``vulnerable`` or
    ``uncertain``; every other outcome (including a missing checker context
    or a missing L1 result) goes straight to report generation.

    Args:
        state: Engine input whose ``info.checker_context.l1_result`` carries
            the L1 ``preliminary_verdict``.

    Returns:
        ``"l2_build_agent"`` or ``"generate_report"``.
    """
    ctx = state.info.checker_context
    l1_result = ctx.l1_result if ctx else None
    if not l1_result:
        return "generate_report"

    verdict = l1_result.preliminary_verdict
    # The verdict is free-form text produced by an LLM; normalise case and
    # surrounding whitespace so "Vulnerable" / " uncertain" do not silently
    # skip the L2 build analysis.
    if isinstance(verdict, str):
        verdict = verdict.strip().lower()

    if verdict in ("vulnerable", "uncertain"):
        return "l2_build_agent"
    return "generate_report"
+ + Returns AgentMorpheusEngineInput with l2_result populated on checker_context. + """ + if cve_build_agent_fn: + return await cve_build_agent_fn.ainvoke(state.model_dump()) + logger.warning("Build agent function not configured, passing state through") + return state + + @catch_pipeline_errors_async + async def generate_report_node(state: AgentMorpheusEngineInput) -> AgentMorpheusOutput: + """Generate the final checker report from L1/L2 investigation results.""" + if cve_checker_report_fn: + return await cve_checker_report_fn.ainvoke(state.model_dump()) + logger.warning("Checker report function not configured, producing empty output") + return AgentMorpheusOutput( + input=state.input, + info=state.info, + output=OutputPayload(analysis=[], vex=None), + ) + + @catch_pipeline_errors_async + async def checker_early_exit_node(state: AgentMorpheusEngineInput) -> AgentMorpheusOutput: + """Produces a proper output when source_acquisition exits with a non-OK status.""" + ctx = state.info.checker_context + status = ctx.status if ctx else None + + # Build detailed reason from identification result when available + detailed_reason = "" + if ctx and ctx.identify_result and ctx.identify_result.conclusion_reason: + detailed_reason = ctx.identify_result.conclusion_reason + + base_reason = ( + PACKAGE_CHECKER_STATUS_DESCRIPTIONS[status] + if status is not None and status in PACKAGE_CHECKER_STATUS_DESCRIPTIONS + else f"Checker exited early with status {status}" + ) + + # Combine base reason with detailed conclusion if available + reason = f"{base_reason}. 
{detailed_reason}" if detailed_reason else base_reason + logger.info("checker_early_exit: status=%s reason=%s", status, reason) + def _get_justification_label(s: PackageCheckerStatus | None) -> str: + if s in (PackageCheckerStatus.PKG_IDENT_NOT_VUL, PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH): + return "not_vulnerable" + if s == PackageCheckerStatus.PKG_INTEL_LOW_SCORE: + return "poor_quality_intel" + return "error" + + analysis = [ + AgentMorpheusEngineOutput( + vuln_id=v.vuln_id, + checklist=[], + summary=reason, + justification=JustificationOutput( + label=_get_justification_label(status), + reason=reason, + status="FALSE" if status in ( + PackageCheckerStatus.PKG_IDENT_NOT_VUL, + PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH, + ) else "UNKNOWN", + ), + intel_score=0, + cvss=None, + ) + for v in state.input.scan.vulns + ] + return AgentMorpheusOutput( + input=state.input, info=state.info, + output=OutputPayload(analysis=analysis, vex=None), + ) + + def route_after_source_acquisition(state: AgentMorpheusEngineInput): + """Route to checker_segmentation (happy path) or early exit on non-OK status.""" + ctx = state.info.checker_context + if ctx and ctx.status == PackageCheckerStatus.OK: + return "checker_segmentation" + return "checker_early_exit" + + def route_after_add_start_time(state: AgentMorpheusInput): + """Route to full pipeline or package checker based on pipeline_mode.""" + if state.image.pipeline_mode == PipelineMode.PACKAGE_CHECKER: + return "checker_init_state" + return "generate_vdbs" # build llm engine subgraph subgraph_builder = StateGraph(AgentMorpheusEngineState) @@ -242,8 +439,28 @@ async def call_llm_engine_subgraph_node(message: AgentMorpheusEngineInput): graph_builder.add_node("add_completed_time", add_completed_time_node) graph_builder.add_node("output_results", output_results_node) graph_builder.add_node("failure", failure_node) +# -- Package checker nodes -- + graph_builder.add_node("checker_init_state", checker_init_state_node) + 
graph_builder.add_node("checker_fetch_intel", checker_fetch_intel_node) + graph_builder.add_node("checker_calculate_intel_score", checker_calculate_intel_score_node) + graph_builder.add_node("source_acquisition", source_acquisition_node) + graph_builder.add_node("checker_early_exit", checker_early_exit_node) + graph_builder.add_node("checker_segmentation", checker_segmentation_node) + graph_builder.add_node("l1_code_agent", l1_code_agent_node) + graph_builder.add_node("l2_build_agent", l2_build_agent_node) + graph_builder.add_node("generate_report", generate_report_node) + graph_builder.add_edge(START, "add_start_time") - graph_builder.add_edge("add_start_time", "generate_vdbs") + # Conditional: route to full pipeline or package checker after add_start_time + graph_builder.add_conditional_edges( + "add_start_time", + route_after_add_start_time, + { + "generate_vdbs": "generate_vdbs", + "checker_init_state": "checker_init_state", + }, + ) + graph_builder.add_conditional_edges("generate_vdbs", check_vdbs_success,{"fetch_intel": "fetch_intel", "failure": "failure"}) graph_builder.add_edge("failure", "add_completed_time") #graph_builder.add_edge("generate_vdbs", "fetch_intel") @@ -252,10 +469,39 @@ async def call_llm_engine_subgraph_node(message: AgentMorpheusEngineInput): graph_builder.add_edge("process_sbom", "check_vuln_deps") graph_builder.add_edge("check_vuln_deps", "llm_engine") graph_builder.add_edge("llm_engine", "add_completed_time") + + # Package checker path + graph_builder.add_edge("checker_init_state", "checker_fetch_intel") + graph_builder.add_edge("checker_fetch_intel", "checker_calculate_intel_score") + graph_builder.add_edge("checker_calculate_intel_score", "source_acquisition") + + graph_builder.add_conditional_edges( + "source_acquisition", + route_after_source_acquisition, + { + "checker_segmentation": "checker_segmentation", + "checker_early_exit": "checker_early_exit", + }, + ) + graph_builder.add_edge("checker_early_exit", "add_completed_time") + 
graph_builder.add_edge("checker_segmentation", "l1_code_agent") + graph_builder.add_conditional_edges( + "l1_code_agent", + route_after_l1, + { + "l2_build_agent": "l2_build_agent", + "generate_report": "generate_report", + }, + ) + graph_builder.add_edge("l2_build_agent", "generate_report") + graph_builder.add_edge("generate_report", "add_completed_time") + + # Shared tail graph_builder.add_edge("add_completed_time", "output_results") graph_builder.add_edge("output_results", END) graph = graph_builder.compile() - + #graph.get_graph().draw_mermaid_png(output_file_path="checker_flow.png") + def convert_str_to_agent_morpheus_input(input: str) -> AgentMorpheusInput: logger.debug("Converting JSON string input to AgentMorpheusInput (length: %d)", len(input)) try: diff --git a/src/vuln_analysis/tools/brew_downloader.py b/src/vuln_analysis/tools/brew_downloader.py new file mode 100644 index 000000000..fb36b5d50 --- /dev/null +++ b/src/vuln_analysis/tools/brew_downloader.py @@ -0,0 +1,353 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Production Brew Downloader -- fetch SRPMs, build logs, and binary RPMs from Brew (Koji). + +Evolved from the PoC at docs/package_analyzer/standalone_checker/brew/brew_downloader.py. +Storage is split: SRPMs go to the shared rpms/ cache, everything else to checker-specific dirs. 
+""" + +from __future__ import annotations + +import os +import re +import shutil +from enum import Enum +from pathlib import Path + +import koji +import requests +import yaml + +from exploit_iq_commons.data_models.checker_status import AcquiredArtifacts +from exploit_iq_commons.logging.loggers_factory import LoggingFactory +from exploit_iq_commons.utils.source_rpm_downloader import SourceRPMDownloader + +logger = LoggingFactory.get_agent_logger(__name__) + +_CONFIGS_DIR = Path(__file__).resolve().parent.parent / "configs" / "brew" +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + +class BrewDownloaderError(Exception): + """Base for all Brew downloader errors.""" + + +class BrewConnectionError(BrewDownloaderError): + """Raised when the Brew hub is unreachable or session creation fails.""" + + +class BrewBuildNotFoundError(BrewDownloaderError): + """Raised when getBuild returns None for the requested NVR.""" + + +class BrewDownloadError(BrewDownloaderError): + """Raised when an HTTP download of an artifact fails.""" + + +class BrewProfileNotImplementedError(BrewDownloaderError): + """Raised when a profile type is not yet implemented.""" + + +# --------------------------------------------------------------------------- +# Profile types +# --------------------------------------------------------------------------- + +class BrewProfileType(Enum): + INTERNAL = "internal" + EXTERNAL = "external" + + +_PROFILE_PATHS: dict[BrewProfileType, Path] = { + BrewProfileType.INTERNAL: _CONFIGS_DIR / "internal-user-profile.yml", + BrewProfileType.EXTERNAL: _CONFIGS_DIR / "external-user-profile.yml", +} + + +_ENV_DEFAULT_RE = re.compile(r"^\$\{([^:}]+):-([^}]+)\}$") + + +def resolve_brew_profile(config_value: str) -> BrewProfileType: + """Resolve brew profile from deployment config. 
class BrewDownloader:
    """Download RPM artifacts and build logs from Brew (Koji) using a profile YAML.

    Storage destinations:
      - SRPMs       -> ``rpm_cache_dir/{NVR}.src.rpm`` (shared with SourceRPMDownloader)
      - Build logs  -> ``checker_dir/logs/{arch}/build.log``
      - Binary RPMs -> ``checker_dir/binaries/{NVR}/{NVRA}.rpm``

    ``connect()`` must be called before any method that talks to the hub;
    those methods raise ``BrewConnectionError`` otherwise.
    """

    def __init__(self, profile_type: BrewProfileType, rpm_cache_dir: str, checker_dir: str) -> None:
        """Load the profile for *profile_type* and prepare the storage directories.

        Raises:
            BrewProfileNotImplementedError: No profile file is registered for
                *profile_type*.
            BrewDownloaderError: The profile file is empty or not a mapping.
        """
        profile_path = _PROFILE_PATHS.get(profile_type)
        if profile_path is None:
            raise BrewProfileNotImplementedError(
                f"Profile type '{profile_type.value}' is not configured"
            )
        self._profile = self._load_profile(str(profile_path))

        hosts = self._profile["hosts"]["rpm"]
        self._brew_hub: str = hosts["brew_hub"]
        self._brew_download: str = hosts["brew_download"]
        self._default_arch: str = self._profile.get("default_arch", "x86_64")
        self._download_binary_rpm_enabled: bool = self._profile.get("download_binary_rpm", False)
        self._auto_fetch_build_log: bool = self._profile.get("build_log", {}).get("auto_fetch", True)
        self._ssl_verify: bool = self._profile.get("ssl_verify", True)
        self._ssl_verify_path: str | None = self._profile.get("verify_path")
        self._http_verify: bool | str = self._resolve_http_verify()

        self._rpm_cache_dir = Path(rpm_cache_dir)
        self._rpm_cache_dir.mkdir(parents=True, exist_ok=True)

        self._checker_dir = Path(checker_dir)
        self._checker_dir.mkdir(parents=True, exist_ok=True)

        self._session: koji.ClientSession | None = None
        self._pathinfo: koji.PathInfo | None = None
        self._http = requests.Session()

    # -- properties --------------------------------------------------------

    @property
    def download_binary_rpm_enabled(self) -> bool:
        """Whether the profile asks for binary RPMs to be downloaded."""
        return self._download_binary_rpm_enabled

    @property
    def default_arch(self) -> str:
        """Architecture used when a caller does not pass one."""
        return self._default_arch

    @property
    def auto_fetch_build_log(self) -> bool:
        """Whether ``try_download_build_log`` actually fetches logs."""
        return self._auto_fetch_build_log

    # -- setup -------------------------------------------------------------

    @staticmethod
    def _load_profile(path: str) -> dict:
        """Parse the profile YAML at *path* and return it as a mapping."""
        with open(path, encoding="utf-8") as fh:
            profile = yaml.safe_load(fh)
        # An empty file parses to None; fail with a clear message instead of a
        # TypeError on the first key access.
        if not isinstance(profile, dict):
            raise BrewDownloaderError(f"Brew profile {path} is empty or not a YAML mapping")
        return profile

    def _resolve_http_verify(self) -> bool | str:
        """Return the ``verify`` value for HTTP calls: False, a CA path, or True."""
        if not self._ssl_verify:
            return False
        if self._ssl_verify_path:
            return self._ssl_verify_path
        return True

    def connect(self) -> None:
        """Create a Koji client session and PathInfo helper from the profile.

        Raises:
            BrewConnectionError: Session or PathInfo creation failed.
        """
        logger.info("Connecting to Brew hub: %s", self._brew_hub)
        try:
            opts: dict = {}
            if not self._ssl_verify:
                opts["no_ssl_verify"] = True
            elif self._ssl_verify_path:
                opts["serverca"] = self._ssl_verify_path
            self._session = koji.ClientSession(self._brew_hub, opts=opts)
            self._pathinfo = koji.PathInfo(topdir=self._brew_download)
            self._http.verify = self._http_verify
        except Exception as exc:
            raise BrewConnectionError(
                f"Failed to connect to Brew hub {self._brew_hub}: {exc}"
            ) from exc

    def close(self) -> None:
        """Release the pooled HTTP connections held by this downloader."""
        self._http.close()

    def _require_session(self) -> koji.ClientSession:
        """Return the Koji session, failing clearly when ``connect()`` was skipped."""
        if self._session is None:
            raise BrewConnectionError("Not connected to Brew hub; call connect() first")
        return self._session

    # -- query -------------------------------------------------------------

    def _lookup_build(self, nvr: str) -> dict | None:
        """Fetch build info for *nvr* from the hub, logging the outcome."""
        build = self._require_session().getBuild(nvr)
        if build is None:
            logger.warning("Build not found: %s", nvr)
            return None
        logger.info(
            "Found build %s (id=%s, volume=%s, task=%s)",
            build["nvr"], build["id"], build.get("volume_name"), build.get("task_id"),
        )
        return build

    def search_build(self, name: str, version: str, release: str) -> dict | None:
        """Look up a build by NVR. Returns the build-info dict or ``None``."""
        nvr = f"{name}-{version}-{release}"
        logger.info("Searching for build: %s", nvr)
        return self._lookup_build(nvr)

    def _source_rpm_info(self, build: dict) -> dict:
        """Return the RPM-info dict of the source RPM belonging to *build*.

        Raises:
            BrewDownloadError: The build lists no source RPM.
        """
        rpms = self._require_session().listRPMs(buildID=build["id"], arches="src")
        if not rpms:
            raise BrewDownloadError(f"No source RPM found for build {build['nvr']}")
        return rpms[0]

    def _rpm_url(self, build: dict, rpm_info: dict) -> str:
        """Compose the download URL of *rpm_info* within *build*."""
        return f"{self._pathinfo.build(build)}/{self._pathinfo.rpm(rpm_info)}"

    # -- downloads ---------------------------------------------------------

    def _download_file(self, url: str, dest: Path) -> Path:
        """Stream-download *url* to *dest* and return the destination path.

        The body is written to a temporary sibling file and moved into place
        only after the transfer completed, so an interrupted download never
        leaves a truncated file that a later cache check would accept.

        Raises:
            BrewDownloadError: The request failed or the stream broke midway.
        """
        logger.info("Downloading %s -> %s", url, dest)
        dest.parent.mkdir(parents=True, exist_ok=True)
        # PID in the name keeps concurrent workers sharing the cache apart.
        partial = dest.with_name(f"{dest.name}.{os.getpid()}.part")
        try:
            with self._http.get(
                url, stream=True, timeout=120, verify=self._http_verify
            ) as resp:
                resp.raise_for_status()
                with open(partial, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=1 << 18):  # 256 KB
                        fh.write(chunk)
            os.replace(partial, dest)
        except requests.RequestException as exc:
            raise BrewDownloadError(f"Failed to download {url}: {exc}") from exc
        finally:
            # No-op after a successful replace; removes leftovers on failure.
            partial.unlink(missing_ok=True)
        logger.info("Saved %s (%d bytes)", dest.name, dest.stat().st_size)
        return dest

    def _get_srpm_url(self, build: dict) -> str:
        """Compute the download URL for the source RPM of *build*."""
        return self._rpm_url(build, self._source_rpm_info(build))

    def download_srpm(self, build: dict) -> Path:
        """Download the .src.rpm for *build* into the shared RPM cache.

        Skips the download when the destination file already exists and is non-empty.

        Raises:
            BrewDownloadError: No source RPM exists or the download failed.
        """
        rpm_info = self._source_rpm_info(build)
        dest = self._rpm_cache_dir / f"{rpm_info['nvr']}.src.rpm"

        if dest.exists() and dest.stat().st_size > 0:
            logger.info("SRPM cache hit: %s", dest)
            return dest

        return self._download_file(self._rpm_url(build, rpm_info), dest)

    def download_build_log(self, build: dict, arch: str | None = None) -> Path:
        """Download ``build.log`` for the given arch into ``checker_dir/logs/{arch}/``."""
        arch = arch or self._default_arch
        url = f"{self._pathinfo.build(build)}/data/logs/{arch}/build.log"
        dest = self._checker_dir / "logs" / arch / "build.log"
        return self._download_file(url, dest)

    def try_download_build_log(self, build: dict, arch: str | None = None) -> Path | None:
        """Download build log when ``build_log.auto_fetch`` is enabled; return None on failure."""
        if not self._auto_fetch_build_log:
            return None
        arch = arch or self._default_arch
        try:
            return self.download_build_log(build, arch)
        except BrewDownloadError as exc:
            logger.warning(
                "Build log unavailable for %s arch=%s: %s",
                build.get("nvr", "?"),
                arch,
                exc,
            )
            return None

    def download_binary_rpm(self, build: dict, arch: str | None = None) -> Path | None:
        """Download all binary RPMs for the given arch (excludes debuginfo/debugsource).

        Returns:
            The directory ``checker_dir/binaries/{NVR}/`` holding the
            downloaded RPMs, or ``None`` when the build has no RPM to
            download for that arch (none listed, or only debug packages).
        """
        arch = arch or self._default_arch
        rpms = self._require_session().listRPMs(buildID=build["id"], arches=arch)
        if not rpms:
            logger.warning("No %s RPMs found for build %s", arch, build["nvr"])
            return None

        build_dir = self._checker_dir / "binaries" / build["nvr"]
        downloaded = 0
        for rpm_info in rpms:
            if rpm_info["name"].endswith(("-debuginfo", "-debugsource")):
                continue
            nvra = f"{rpm_info['name']}-{rpm_info['version']}-{rpm_info['release']}.{rpm_info['arch']}"
            self._download_file(self._rpm_url(build, rpm_info), build_dir / f"{nvra}.rpm")
            downloaded += 1

        if not downloaded:
            # Avoid handing back a directory that was never created.
            logger.warning("Only debug RPMs found for build %s arch=%s", build["nvr"], arch)
            return None
        return build_dir

    def download_patched_srpm(self, name: str, version: str, release: str) -> Path | None:
        """Download the SRPM for a patched version (from CVE fix info).

        Returns the cached SRPM path, or ``None`` if the patched build is not
        found in Brew.
        """
        build = self.search_build(name, version, release)
        if build is None:
            return None
        return self.download_srpm(build)

    def download_patched_srpm_by_nevra(self, nevra: str) -> Path | None:
        """Download the SRPM for a patched version (from NEVRA).

        Returns the cached SRPM path, or ``None`` if the patched build is not
        found in Brew.
        """
        build = self._lookup_build(nevra)
        if build is None:
            return None
        return self.download_srpm(build)

    def download_target_artifacts(self, name: str, version: str, release: str, arch: str) -> AcquiredArtifacts | None:
        """Fetch everything needed to analyse the target build.

        Downloads (or reuses) the SRPM, copies and extracts it under
        ``checker_dir/source``, then fetches the build log and, when the
        profile enables it, the binary RPMs.

        Raises:
            BrewBuildNotFoundError: The NVR does not exist in Brew.
            BrewDownloadError: The SRPM is missing or its download failed.
        """
        artifacts = AcquiredArtifacts()
        build = self.search_build(name, version, release)
        if build is None:
            raise BrewBuildNotFoundError(f"Build not found for {name}-{version}-{release}")

        artifacts.source_url = self._get_srpm_url(build)
        cache_srpm_path = self.download_srpm(build)

        srpm_target_path = self._checker_dir / "source"
        srpm_target_path.mkdir(parents=True, exist_ok=True)
        shutil.copy2(cache_srpm_path, srpm_target_path)
        SourceRPMDownloader.extract_src_rpm(cache_srpm_path, srpm_target_path)
        artifacts.srpm_path = srpm_target_path

        artifacts.build_log_path = self.try_download_build_log(build, arch)
        if self._download_binary_rpm_enabled:
            artifacts.binary_rpm_path = self.download_binary_rpm(build, arch)
        return artifacts
""" top_k: int = Field(default=5, description="Top K to use for the lexical search") + base_code_index_dir: str = Field( + default=".cache/am_cache/code_index", + description="Base directory for Tantivy code index storage.", + ) @register_function(config_type=LexicalSearchToolConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN]) @@ -43,8 +48,16 @@ async def lexical_search(config: LexicalSearchToolConfig, builder: Builder): # @catch_tool_errors(LEXICAL_CODE_SEARCH) async def _arun(query: str) -> str: workflow_state = ctx_state.get() - code_index_path = workflow_state.code_index_path - full_text_search = FullTextSearch.get_instance(cache_path=code_index_path) + + pipeline_mode = getattr(workflow_state.original_input.input.image, 'pipeline_mode', None) + + if pipeline_mode == PipelineMode.PACKAGE_CHECKER: + source_key = workflow_state.original_input.info.checker_context.source_key + code_index_path = str(FullTextSearch.get_index_directory(config.base_code_index_dir, source_key)) + else: + code_index_path = workflow_state.code_index_path + + full_text_search = FullTextSearch(cache_path=code_index_path) if full_text_search.is_empty(): logger.debug("Lexical search: index is empty at %s", code_index_path) diff --git a/src/vuln_analysis/tools/source_grep.py b/src/vuln_analysis/tools/source_grep.py new file mode 100644 index 000000000..38b25cf98 --- /dev/null +++ b/src/vuln_analysis/tools/source_grep.py @@ -0,0 +1,224 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Native Unix grep tool for fast source code searching. + +Provides an LLM-callable tool that uses native grep subprocess for +faster searching compared to Python-based regex scanning. +""" + +from pathlib import Path + +from aiq.builder.builder import Builder +from aiq.builder.framework_enum import LLMFrameworkEnum +from aiq.builder.function_info import FunctionInfo +from aiq.cli.register_workflow import register_function +from aiq.data_models.function import FunctionBaseConfig +from pydantic import Field + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory +from vuln_analysis.tools.source_inspector import SourceInspector +from vuln_analysis.utils.error_handling_decorator import catch_tool_errors + +SOURCE_GREP = "source_grep" + +logger = LoggingFactory.get_agent_logger(__name__) + + +class SourceGrepToolConfig(FunctionBaseConfig, name=SOURCE_GREP): + """Fast grep search using native Unix grep subprocess.""" + + base_checker_dir: str = Field( + default=".cache/am_cache/checker", + description="Root directory for checker-specific artifacts.", + ) + max_results: int = Field( + default=50, + description="Maximum number of grep results to return.", + ) + context_lines: int = Field( + default=3, + description="Number of context lines around each match.", + ) + + +VALID_TARGETS = ("source", "logs", "patch") + +TARGET_EXTENSIONS: dict[str, list[str]] = { + "source": ["*.c", "*.h", "*.cpp", "*.hpp", "*.py", "*.go", "*.java", "*.spec", "*.cmake", "Makefile", "*.mk", "*.config"], + "logs": [], # empty = search all files + "patch": ["*.patch", 
"*.diff"], +} + + +def _parse_query(query: str) -> tuple[str | list[str], str | None, str, bool]: + """Parse query string into (pattern(s), file_glob, target, word_boundary). + + Supports formats: + - "pattern" -> search source (default) + - "pattern,*.c" -> search source, only .c files + - "target:pattern" -> search specific target + - "target:pattern,file_glob" -> search target with file filter + - "pattern -w" -> search with word boundary (whole words only) + - "target:pattern,file_glob -w" -> full format with word boundary + - "pattern1;pattern2,file.c" -> multiple patterns (only with file_glob) + + Valid targets: source, logs, patch + + Note: Multiple patterns (separated by ';') are only supported when + a file_glob is provided. This prevents overly broad multi-pattern searches. + """ + query = query.strip().strip('"').strip("'") + + word_boundary = False + if query.endswith(" -w"): + word_boundary = True + query = query[:-3].strip() + + target = "source" + if ":" in query: + prefix, rest = query.split(":", 1) + if prefix in VALID_TARGETS: + target = prefix + query = rest + + if "," in query: + parts = query.split(",", 1) + pattern_part = parts[0].strip() + file_glob = parts[1].strip() if len(parts) > 1 else None + + # Multi-pattern support: only when file_glob is provided + if file_glob and ";" in pattern_part: + patterns = [p.strip() for p in pattern_part.split(";") if p.strip()] + return patterns, file_glob, target, word_boundary + + return pattern_part, file_glob, target, word_boundary + + return query, None, target, word_boundary + + +def _format_results(pattern: str, matches: list, root: Path) -> str: + """Format grep results for LLM consumption.""" + if not matches: + return f"No matches found for '{pattern}'" + + lines = [f"Found {len(matches)} match(es) for '{pattern}':\n"] + for i, match in enumerate(matches, 1): + try: + rel_path = match.file_path.relative_to(root) + except ValueError: + rel_path = match.file_path + lines.append(f"{i}. 
@register_function(config_type=SourceGrepToolConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def source_grep(config: SourceGrepToolConfig, builder: Builder):  # pylint: disable=unused-argument
    """Register the native-grep search tool.

    Yields one ``FunctionInfo`` wrapping ``_arun``, which resolves the search
    directory from the workflow state and delegates the search itself to
    ``SourceInspector.grep_native``.
    """
    from vuln_analysis.runtime_context import ctx_state

    @catch_tool_errors(SOURCE_GREP)
    async def _arun(query: str) -> str:
        """Search source code, build logs, or patches using native Unix grep.

        Query format: '[target:]pattern[,file_glob][ -w]'

        Targets:
        - source (default): Package source code
        - logs: Build compilation logs
        - patch: Fixed patches from newer RPM version

        Options:
        - -w: Match whole words only (word boundary)
        - Multiple patterns: use ';' separator ONLY with a specific file

        Examples:
        - 'archive_read_open' - search source files
        - 'archive_read_open,*.c' - search only .c source files
        - 'archive_read_open -w' - search for whole word only
        - 'unsigned int cursor;unsigned int nodes,archive_read.c' - multiple patterns in one file
        - 'logs:undefined reference' - search build logs for link errors
        - 'logs:error:' - search build logs for error messages
        - 'patch:CVE-2026-5121' - find patch for specific CVE
        - 'patch:archive_read,*.patch' - search in patch files
        """
        state = ctx_state.get()
        request = state.original_input

        # The checker context supplies the per-package key under base_checker_dir.
        checker_context = request.info.checker_context if request and request.info else None
        if checker_context is None or not checker_context.source_key:
            raise ValueError("Checker context or source_key not available in workflow state")

        pattern, file_glob, target, word_boundary = _parse_query(query)
        package_root = Path(config.base_checker_dir) / checker_context.source_key

        if target != "logs":
            target_dir = (package_root / target).resolve()
            if not target_dir.is_dir():
                raise ValueError(f"Target directory does not exist: {target_dir}")
        else:
            # Build logs are stored per architecture, so the arch must be known.
            target_package = request.input.image.target_package
            if target_package is None or not target_package.arch:
                raise ValueError("logs target requires target_package.arch in workflow state")

            arch = target_package.arch
            target_dir = (package_root / "logs" / arch).resolve()
            # A missing log directory is reported to the caller as text, not raised.
            if not target_dir.is_dir():
                return f"No build log available for architecture '{arch}'. Build log may not have been fetched for this package."

        inspector = SourceInspector(target_dir)

        logger.info("Source grep: searching for '%s' in %s (target: %s, glob: %s, word_boundary: %s)",
                    pattern, target_dir, target, file_glob or "default extensions", word_boundary)

        grep_output = await inspector.grep_native(
            patterns=pattern,
            file_glob=file_glob,
            word_boundary=word_boundary,
            context_lines=config.context_lines,
            max_results=config.max_results,
            # Unknown targets fall back to "no extension filter".
            default_extensions=TARGET_EXTENSIONS.get(target, []),
        )

        logger.info("Source grep: found matches for '%s' in target '%s'", pattern, target)
        return grep_output

    tool_description = (
        "Fast grep search using native Unix grep. "
        "Query format: '[target:]pattern[,file_glob][ -w]'. "
        "Targets: 'source' (default) for package source code, "
        "'logs' for build compilation logs, "
        "'patch' for fixed patches from newer RPM. "
        "Add ' -w' suffix for whole-word matching. "
        "Multiple patterns: use ';' separator ONLY with a specific file, e.g., "
        "'pattern1;pattern2,filename.c' searches for both patterns in that file. "
        "Examples: 'archive_read_open' searches source, "
        "'archive_read_open,*.c' searches only C source files, "
        "'archive_read_open -w' searches for whole word only, "
        "'unsigned int cursor;unsigned int nodes,archive_read.c' searches multiple patterns in one file, "
        "'logs:undefined reference' searches build logs, "
        "'patch:CVE-2026-5121' searches patch files."
    )
    yield FunctionInfo.from_fn(_arun, description=tool_description)
The class carries no domain-specific + logic (RPM, spec, changelog); callers compose the primitives for that. + """ + + def __init__(self, source_dir: Path) -> None: + if ".." in source_dir.parts: + raise ValueError(f"Path contains traversal components: {source_dir}") + self._root = source_dir.resolve() + if not self._root.is_dir(): + raise FileNotFoundError(f"source_dir does not exist: {self._root}") + + @property + def root(self) -> Path: + return self._root + + def find_files(self, pattern: str, recursive: bool = True) -> list[Path]: + """Glob over the source tree. + + Parameters + ---------- + pattern: + Shell glob pattern, e.g. ``"*.spec"`` or ``"*.patch"``. + recursive: + If *True* use ``**/`` (deep search). + If *False* use ```` (root-level only). + """ + glob_expr = f"**/{pattern}" if recursive else pattern + return sorted(self._root.glob(glob_expr)) + + def grep_content( + self, + pattern: str, + file_path: Path | None = None, + *, + recursive: bool = False, + ) -> list[GrepMatch]: + """Search file contents for a regex *pattern*. + + Parameters + ---------- + pattern: + Regular expression (case-sensitive by default). + file_path: + If given, search that file only, or (if it is a directory) every file + in that directory (one level, regular files only). + If the path does not exist, return no matches. + If *None*, search every file under *source_dir* + (depth controlled by *recursive*). + recursive: + Only used when *file_path* is ``None``. + ``False`` searches only root-level files; ``True`` walks the tree. 
+ """ + regex = re.compile(pattern) + matches: list[GrepMatch] = [] + + if file_path is not None: + resolved = file_path.resolve() + if resolved.is_file(): + targets = [resolved] + elif resolved.is_dir(): + targets = sorted(p for p in resolved.iterdir() if p.is_file()) + else: + targets = [] + elif recursive: + targets = sorted(p for p in self._root.rglob("*") if p.is_file()) + else: + targets = sorted(p for p in self._root.iterdir() if p.is_file()) + + for fp in targets: + try: + lines = fp.read_text(encoding="utf-8", errors="replace").splitlines() + except (OSError, UnicodeDecodeError): + continue + for idx, line in enumerate(lines, start=1): + if regex.search(line): + matches.append(GrepMatch(file_path=fp, line_number=idx, line_content=line)) + return matches + + async def grep_native( + self, + patterns: str | list[str], + file_glob: str | None = None, + *, + case_insensitive: bool = False, + word_boundary: bool = False, + context_lines: int = 0, + max_results: int = 50, + default_extensions: list[str] | None = None, + ) -> str: + """Fast grep using native Unix grep subprocess. + + Parameters + ---------- + patterns: + Search pattern(s). Can be a single string or list of patterns. + When multiple patterns are provided, matches ANY of them (OR logic). + file_glob: + Optional file pattern (e.g., ``"*.c"``, ``"*.h"``). If provided, + overrides default_extensions. + case_insensitive: + If *True*, perform case-insensitive matching (``-i`` flag). + word_boundary: + If *True*, match whole words only (``-w`` flag). + context_lines: + Lines of context around match (``-C`` flag). Default 0. + max_results: + Stop after this many matches (``-m`` flag). Default 50. + default_extensions: + List of file extensions to search when file_glob is not provided. + If *None*, searches ALL files (no --include filter). + If empty list ``[]``, searches ALL files (no --include filter). + + Returns + ------- + str + Raw grep output with matches found. 
+ """ + cmd = ["grep", "-rn", "-I"] + + if case_insensitive: + cmd.append("-i") + if word_boundary: + cmd.append("-w") + if context_lines > 0: + cmd.extend(["-C", str(context_lines)]) + + # If file_glob contains a path (e.g., "lib/connect.c"), split into: + # - path_filter: directory portion for post-filtering results (e.g., "lib/") + # - file_glob: filename only for --include (e.g., "connect.c") + path_filter = None + if file_glob and "/" in file_glob: + path_filter = file_glob.rsplit("/", 1)[0] + "/" + file_glob = file_glob.rsplit("/", 1)[1] + + if file_glob: + cmd.extend(["--include", file_glob]) + elif default_extensions is None: + pass # No filtering - search all files (caller should pass extensions explicitly) + elif default_extensions: + for ext in default_extensions: + cmd.extend(["--include", ext]) + + cmd.extend(["-m", str(max_results)]) + + # Handle single or multiple patterns + if isinstance(patterns, list): + for p in patterns: + cmd.extend(["-e", p]) + else: + cmd.extend(["-e", patterns]) + + cmd.append(".") + + def _run_grep() -> tuple[str, int]: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + errors="replace", + cwd=self._root, + timeout=30, + ) + return result.stdout, result.returncode + + try: + tracer = Context.get() + except Exception: + tracer = None + + if tracer: + with tracer.push_active_function("grep_native", input_data={"command": shlex.join(cmd), "path_filter": path_filter}) as span: + stdout, returncode = await asyncio.to_thread(_run_grep) + span.set_output({ + "return_code": returncode, + "match_count": stdout.count('\n') if stdout else 0, + }) + else: + stdout, returncode = await asyncio.to_thread(_run_grep) + + # Post-filter results to only include paths matching path_filter + if path_filter and stdout: + filtered_lines = [line for line in stdout.splitlines() if path_filter in line] + stdout = "\n".join(filtered_lines) + if filtered_lines: + stdout += "\n" + + return stdout + + def read_file( + self, + 
file_path: Path, + offset: int = 0, + max_lines: int | None = None, + ) -> str: + """Read file content starting from a line *offset*. + + Parameters + ---------- + file_path: + Absolute or relative path (resolved against *source_dir*). + offset: + 0-based line offset to start reading from. + max_lines: + Maximum number of lines to return. ``None`` means read to EOF. + """ + resolved = file_path if file_path.is_absolute() else (self._root / file_path) + if not resolved.resolve().is_relative_to(self._root.resolve()): + raise ValueError(f"Path escapes root directory: {file_path}") + lines = resolved.read_text(encoding="utf-8", errors="replace").splitlines() + end = (offset + max_lines) if max_lines is not None else len(lines) + return "\n".join(lines[offset:end]) diff --git a/src/vuln_analysis/tools/tool_names.py b/src/vuln_analysis/tools/tool_names.py index f07d40d50..02125e9f2 100644 --- a/src/vuln_analysis/tools/tool_names.py +++ b/src/vuln_analysis/tools/tool_names.py @@ -65,6 +65,9 @@ class ToolNames: IMPORT_USAGE_ANALYZER = "Import Usage Analyzer" """Finds all imports and usage patterns of a specific package across indexed sources.""" + SOURCE_GREP = "Source Grep" + """Fast grep search in source code using native Unix grep""" + # Module-level constants for convenience imports CODE_SEMANTIC_SEARCH = ToolNames.CODE_SEMANTIC_SEARCH @@ -78,6 +81,7 @@ class ToolNames: FUNCTION_LIBRARY_VERSION_FINDER = ToolNames.FUNCTION_LIBRARY_VERSION_FINDER CONFIGURATION_SCANNER = ToolNames.CONFIGURATION_SCANNER IMPORT_USAGE_ANALYZER = ToolNames.IMPORT_USAGE_ANALYZER +SOURCE_GREP = ToolNames.SOURCE_GREP @@ -94,4 +98,5 @@ class ToolNames: 'FUNCTION_LIBRARY_VERSION_FINDER', 'CONFIGURATION_SCANNER', 'IMPORT_USAGE_ANALYZER', + 'SOURCE_GREP', ] diff --git a/src/vuln_analysis/utils/clients/nvd_client.py b/src/vuln_analysis/utils/clients/nvd_client.py index 1b13ff51c..fd6a43d54 100644 --- a/src/vuln_analysis/utils/clients/nvd_client.py +++ b/src/vuln_analysis/utils/clients/nvd_client.py 
@@ -125,19 +125,21 @@ async def _get_cwe_elements(self, cve_obj: dict) -> dict: those CWEs. """ # Get CWE name - cwe_id = None + raw_cwe_id = None weaknesses = cve_obj.get('weaknesses', []) - cwe_id = self._get_cwe(weaknesses) + raw_cwe_id = self._get_cwe(weaknesses) cwe_link = None cwe_name = None cwe_description = None cwe_extended_description = None - if cwe_id is not None: - if cwe_id.startswith('CWE-'): - cwe_id = cwe_id.replace('CWE-', '', 1) + cwe_id_numeric = None + if raw_cwe_id is not None: + cwe_id_numeric = raw_cwe_id + if cwe_id_numeric.startswith('CWE-'): + cwe_id_numeric = cwe_id_numeric.replace('CWE-', '', 1) - if cwe_id.isnumeric(): - cwe_link = self._cwe_details_url_template.format(CWE_ID=cwe_id) + if cwe_id_numeric.isnumeric(): + cwe_link = self._cwe_details_url_template.format(CWE_ID=cwe_id_numeric) if cwe_link is not None: soup = await self._get_soup(cwe_link) @@ -155,7 +157,9 @@ async def _get_cwe_elements(self, cve_obj: dict) -> dict: if extended_description_div: cwe_extended_description = extended_description_div.find('div', class_='indent').text.strip() + cwe_id = f"CWE-{cwe_id_numeric}" if cwe_id_numeric and cwe_id_numeric.isnumeric() else raw_cwe_id return { + "cwe_id": cwe_id, "cwe_name": cwe_name, "cwe_description": cwe_description, "cwe_extended_description": cwe_extended_description, @@ -330,6 +334,7 @@ async def get_intel(self, cve_id: str) -> CveIntelNvd: cvss_vector=cvss_vector, cvss_base_score=cvss_base_score, cvss_severity=cvss_severity, + cwe_id=cwe_elements["cwe_id"], cwe_name=cwe_elements["cwe_name"], cwe_description=cwe_elements["cwe_description"], cwe_extended_description=cwe_elements["cwe_extended_description"], diff --git a/src/vuln_analysis/utils/full_text_search.py b/src/vuln_analysis/utils/full_text_search.py index 9ace44b9e..87247b67b 100644 --- a/src/vuln_analysis/utils/full_text_search.py +++ b/src/vuln_analysis/utils/full_text_search.py @@ -284,7 +284,7 @@ def add_documents_from_code_path(self, for root, _, files in 
os.walk(code_path): for file in files: - if any(file.endswith(ext) for ext in include_extensions): + if any(file.endswith(ext) for ext in include_extensions) or file in no_extension: file_path = os.path.join(root, file) try: with open(file_path, "r") as f: diff --git a/src/vuln_analysis/utils/gerrit_client.py b/src/vuln_analysis/utils/gerrit_client.py new file mode 100644 index 000000000..7a32489dd --- /dev/null +++ b/src/vuln_analysis/utils/gerrit_client.py @@ -0,0 +1,362 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gerrit API client for Chromium bug resolution. 
class GerritChangeCandidate(BaseModel):
    """Represents a candidate Gerrit CL for LLM selection."""
    # Built by list_merged_changes from raw Gerrit change dicts; the three
    # fields are exactly what build_gerrit_cl_select_prompt renders per line.
    submission_id: int  # _number field from Gerrit API
    project: str  # e.g., "angle/angle", "chromium/src"
    subject: str  # Commit subject line


class GerritChangeSelection(BaseModel):
    """LLM response for selecting the correct Gerrit CL."""
    # This model is passed to llm.with_structured_output() in
    # select_gerrit_change.
    # NOTE(review): field names (and presumably the docstring) are then part of
    # the schema the LLM sees - treat renames as behavior changes.
    submission_id: int | None  # None = no matching fix found
    reason: str  # Explanation for the selection
def list_merged_changes(raw_changes: list[dict]) -> list[GerritChangeCandidate]:
    """Filter Gerrit changes to only MERGED status.

    Changes are de-duplicated by their ``_number``; the first occurrence wins
    and input order is preserved.

    Args:
        raw_changes: List of change dicts from Gerrit API

    Returns:
        List of GerritChangeCandidate for MERGED CLs only
    """
    candidates: list[GerritChangeCandidate] = []
    seen_numbers = set()
    for change in raw_changes:
        if change.get("status") != "MERGED":
            continue

        number = change.get("_number")
        if number in seen_numbers:
            continue
        seen_numbers.add(number)

        candidates.append(GerritChangeCandidate(
            # A change without "_number" is kept once, with id 0.
            submission_id=change.get("_number", 0),
            project=change.get("project", ""),
            subject=change.get("subject", ""),
        ))
    return candidates
async def get_current_commit_sha(
    session: aiohttp.ClientSession,
    change_number: int,
) -> str | None:
    """Look up the commit SHA of a Gerrit CL's current revision.

    Args:
        session: aiohttp session
        change_number: Gerrit change number (_number field)

    Returns:
        Commit SHA string, or None if not found or the request failed
    """
    request_kwargs = {
        "method": "GET",
        "url": f"{GERRIT_BASE_URL}/changes/{change_number}/revisions/current/commit",
        "timeout": aiohttp.ClientTimeout(total=GERRIT_TIMEOUT_SECONDS),
    }

    try:
        async with request_with_retry(
            session=session,
            request_kwargs=request_kwargs,
            max_retries=3,
            sleep_time=0.5,
            log_on_error=False,
        ) as response:
            payload = parse_gerrit_response(await response.text())
    except aiohttp.ClientResponseError as e:
        logger.warning("Failed to get commit SHA for CL %d: %s", change_number, e)
        return None
    except Exception as e:
        # Best-effort lookup: any other failure (including bad JSON) yields None.
        logger.warning("Error getting commit SHA for CL %d: %s", change_number, e)
        return None

    # Only a JSON object can carry the "commit" field; anything else means "not found".
    if not isinstance(payload, dict):
        return None
    return payload.get("commit")
async def select_gerrit_change(
    candidates: list[GerritChangeCandidate],
    cve_id: str,
    cve_description: str,
    llm: "BaseChatModel | None" = None,
) -> int | None:
    """Pick the Gerrit CL that fixes a CVE out of its MERGED candidates.

    Args:
        candidates: List of MERGED CL candidates
        cve_id: CVE identifier
        cve_description: CVE description for context
        llm: Optional LangChain LLM (base model, not pre-configured with structured output)

    Returns:
        submission_id of selected CL, or None if no valid selection
    """

    def _is_roll(candidate) -> bool:
        # Dependency rolls: subject starts with "roll ", or any chromium/src
        # CL whose subject mentions "roll".
        subject = candidate.subject.lower()
        return subject.startswith("roll ") or (candidate.project == "chromium/src" and "roll" in subject)

    if not candidates:
        return None

    # A single candidate needs no arbitration.
    if len(candidates) == 1:
        return candidates[0].submission_id

    if llm is None:
        # Heuristic fallback: drop roll commits, then prefer upstream projects.
        remaining = [c for c in candidates if not _is_roll(c)]
        if len(remaining) == 1:
            return remaining[0].submission_id

        outside_chromium_src = [c for c in remaining if c.project != "chromium/src"]
        if len(outside_chromium_src) == 1:
            return outside_chromium_src[0].submission_id

        # Still ambiguous: fail safely rather than guess.
        logger.warning(
            "Multiple MERGED CLs for %s, no LLM available for selection: %s",
            cve_id,
            [c.submission_id for c in candidates]
        )
        return None

    prompt = build_gerrit_cl_select_prompt(candidates, cve_id, cve_description)

    try:
        structured_llm = llm.with_structured_output(GerritChangeSelection)
        raw_answer = await structured_llm.ainvoke(prompt)
        if isinstance(raw_answer, GerritChangeSelection):
            selection = raw_answer
        else:
            selection = GerritChangeSelection.model_validate(raw_answer)

        chosen = selection.submission_id
        if chosen is None:
            logger.info("LLM returned no selection for %s: %s", cve_id, selection.reason)
            return None

        # Guard against ids the model invented.
        valid_ids = {c.submission_id for c in candidates}
        if chosen not in valid_ids:
            logger.warning(
                "LLM selected invalid submission_id %d for %s (valid: %s)",
                chosen, cve_id, valid_ids
            )
            return None

        logger.info(
            "LLM selected CL %d for %s: %s",
            chosen, cve_id, selection.reason
        )
        return chosen

    except Exception as e:
        logger.warning("LLM selection failed for %s: %s", cve_id, e)
        return None
_COMMIT_URL_KEYWORDS = frozenset({
    "github.com/", "gitlab.com/", "gitlab.", "bitbucket.org/",
    "/commit/", "/commits/", "/pull/", "/merge_requests/",
    ".git", "git.kernel.org", "git.savannah", "cgit",
})

# Chromium issue tracker URL pattern - captures the bug ID
CHROMIUM_ISSUE_PATTERN = re.compile(r"https?://issues\.chromium\.org/issues/(\d+)")


def extract_commit_url_candidates(intel: CveIntel) -> dict[str, list[str]]:
    """Extract URLs from intel references that may contain commit/patch information.

    Scans GHSA, NVD, RHSA, and Ubuntu references for URLs containing keywords
    that suggest they may point to source code commits or patches. Also includes
    Chromium issue tracker URLs (issues.chromium.org/issues/) which can be
    resolved to patches via Gerrit/Gitiles.

    Surrounding whitespace (including stray ``\\r`` from CRLF-separated
    reference blobs) is stripped from every returned URL.

    Args:
        intel: CveIntel object containing intel from various providers

    Returns:
        Dict mapping provider name to list of matching URLs. A provider key is
        present only when that provider's intel is set.
        Example: {"ghsa": ["https://github.com/foo/bar/commit/abc123"], "nvd": []}
    """

    def _is_candidate(url: str) -> bool:
        lowered = url.lower()
        if any(kw in lowered for kw in _COMMIT_URL_KEYWORDS):
            return True
        # The pattern is anchored at the start, hence the strip() in _extract_refs.
        return bool(CHROMIUM_ISSUE_PATTERN.match(url))

    def _extract_refs(refs: list[str] | None) -> list[str]:
        if not refs:
            return []
        # Non-string entries are ignored; strings are normalised before matching.
        cleaned = (r.strip() for r in refs if isinstance(r, str))
        return [url for url in cleaned if _is_candidate(url)]

    result: dict[str, list[str]] = {}

    # GHSA references (extra field, may not exist)
    if intel.ghsa:
        result["ghsa"] = _extract_refs(getattr(intel.ghsa, "references", None))

    # NVD references (explicit field)
    if intel.nvd:
        result["nvd"] = _extract_refs(intel.nvd.references)

    # RHSA references (extra field, may not exist)
    if intel.rhsa:
        rhsa_refs = getattr(intel.rhsa, "references", None)
        if isinstance(rhsa_refs, list):
            # RHSA references may be newline-separated strings; splitlines()
            # also handles CRLF, so no "\r" is left glued to a URL.
            flat_refs = []
            for ref in rhsa_refs:
                if isinstance(ref, str):
                    flat_refs.extend(ref.splitlines())
                else:
                    flat_refs.append(ref)
            result["rhsa"] = _extract_refs(flat_refs)
        else:
            result["rhsa"] = []

    # Ubuntu references (explicit field)
    if intel.ubuntu:
        result["ubuntu"] = _extract_refs(getattr(intel.ubuntu, "references", None))
        # Also include patches field URLs
        if intel.ubuntu.patches:
            patch_urls = []
            for pkg_patches in intel.ubuntu.patches.values():
                patch_urls.extend(pkg_patches or [])
            result["ubuntu_patches"] = _extract_refs(patch_urls)

    return result
selected: str, all_candidates: list[dict]) -> list[str]: """Narrow CVE intel strings to the user-selected package after disambiguation. Intel lines are built from GHSA, RHSA... and may mention several modules; this removes diff --git a/src/vuln_analysis/utils/osv_patch_retriever.py b/src/vuln_analysis/utils/osv_patch_retriever.py new file mode 100644 index 000000000..4eaaaf105 --- /dev/null +++ b/src/vuln_analysis/utils/osv_patch_retriever.py @@ -0,0 +1,570 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""OSV Patch Retriever - fetch upstream fix patches from OSV when RPM patches are unavailable. + +.. deprecated:: + This module is deprecated. Use :mod:`vuln_analysis.utils.web_patch_fetcher` instead. 
+ The new module provides a unified interface for fetching patches from: + - Intel references (GHSA, NVD, RHSA, Ubuntu) + - OSV API + - kernel.org (in addition to GitHub) + + Migration guide: + - Replace `OSVPatchRetriever` with `OSVClient` from `web_patch_fetcher` + - Replace `fetch_ubuntu_patch` with `WebPatchFetcher.fetch_from_intel_refs` +""" + +import warnings + +warnings.warn( + "osv_patch_retriever is deprecated, use web_patch_fetcher instead", + DeprecationWarning, + stacklevel=2, +) + +from __future__ import annotations + +import os +import re +from typing import TYPE_CHECKING + +import aiohttp +from pydantic import BaseModel +from unidiff import PatchSet + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory +from vuln_analysis.utils.async_http_utils import request_with_retry +from vuln_analysis.functions.code_agent_graph_defs import OSVPatchResult + +if TYPE_CHECKING: + from vuln_analysis.functions.code_agent_graph_defs import ParsedPatch + +logger = LoggingFactory.get_agent_logger(__name__) + +_OSV_API_URL = os.environ.get("OSV_API_URL", "https://api.osv.dev/v1/vulns/") +_OSV_TIMEOUT_SECONDS = int(os.environ.get("OSV_TIMEOUT_SECONDS", "10")) +_GITHUB_PATCH_TIMEOUT_SECONDS = int(os.environ.get("GITHUB_PATCH_TIMEOUT_SECONDS", "30")) + +_BINARY_FILE_EXTENSIONS = frozenset({ + '.uu', '.uue', '.iso', '.bin', '.gz', '.bz2', '.xz', '.zip', '.tar', '.tgz', '.tbz2', + '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', + '.pdf', '.doc', '.docx', '.xls', '.xlsx', + '.exe', '.dll', '.so', '.dylib', '.a', '.o', '.obj', + '.pyc', '.pyo', '.class', '.jar', '.war', +}) + +_GITHUB_REPO_PATTERN = re.compile(r"https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") + + +class OSVAffectedRange(BaseModel): + """Represents a Git range from an OSV affected block.""" + repo_url: str | None = None + fixed_commit: str | None = None + introduced_commit: str | None = None + + + +def _is_binary_file_path(path: str) -> bool: + """Check if file path has a binary 
file extension.""" + path_lower = path.lower() + return any(path_lower.endswith(ext) for ext in _BINARY_FILE_EXTENSIONS) + + +def _version_in_range(version: str, introduced: str | None, fixed: str | None) -> bool: + """Check if provided upstream version falls within [introduced, fixed) range. + + Returns True if version >= introduced (or introduced is None) AND version < fixed (or fixed is None). + """ + try: + from packaging.version import parse as parse_version + v = parse_version(version) + if introduced: + try: + if v < parse_version(introduced): + return False + except Exception: + pass + if fixed: + try: + if v >= parse_version(fixed): + return False + except Exception: + pass + return True + except Exception: + return True + + +def _parse_patch_content(patch_content: str, patch_filename: str) -> "ParsedPatch | None": + """Parse patch content string into structured ParsedPatch model. + + Reuses the same logic as code_agent_graph_defs.parse_patch_file but works on string content. + """ + from vuln_analysis.functions.code_agent_graph_defs import ParsedPatch, PatchFile, PatchHunk + + try: + patch_set = PatchSet.from_string(patch_content) + except Exception: + logger.warning("_parse_patch_content: failed to parse patch content") + return None + + files: list[PatchFile] = [] + for patched_file in patch_set: + if patched_file.is_binary_file: + continue + if _is_binary_file_path(patched_file.target_file): + continue + + hunks: list[PatchHunk] = [] + for hunk in patched_file: + context, removed, added = [], [], [] + for line in hunk: + if line.is_context: + context.append(str(line.value).rstrip("\n")) + elif line.is_removed: + removed.append(str(line.value).rstrip("\n")) + elif line.is_added: + added.append(str(line.value).rstrip("\n")) + + hunks.append(PatchHunk( + source_start=hunk.source_start, + source_length=hunk.source_length, + target_start=hunk.target_start, + target_length=hunk.target_length, + context_lines=context, + removed_lines=removed, + 
added_lines=added, + )) + + files.append(PatchFile( + source_path=patched_file.source_file, + target_path=patched_file.target_file, + hunks=hunks, + is_new_file=patched_file.is_added_file, + is_deleted_file=patched_file.is_removed_file, + )) + + return ParsedPatch(patch_filename=patch_filename, files=files) + + +def _extract_commit_metadata(patch_content: str) -> tuple[str | None, str | None, str | None]: + """Extract commit message, author, and date from GitHub .patch format. + + GitHub .patch format starts with: + From Mon Sep 17 00:00:00 2001 + From: Author Name + Date: Tue, 1 Jan 2024 12:00:00 +0000 + Subject: [PATCH] Commit message + + Extended commit message... + --- + + """ + lines = patch_content.split('\n') + author = None + date = None + subject_lines = [] + in_subject = False + + for line in lines: + if line.startswith('From:'): + author = line[5:].strip() + elif line.startswith('Date:'): + date = line[5:].strip() + elif line.startswith('Subject:'): + in_subject = True + subject_part = line[8:].strip() + if subject_part.startswith('[PATCH'): + idx = subject_part.find(']') + if idx != -1: + subject_part = subject_part[idx + 1:].strip() + subject_lines.append(subject_part) + elif in_subject: + if line.startswith('---') or line.startswith('diff --git'): + break + if line.strip() == '': + in_subject = False + else: + subject_lines.append(line.strip()) + + commit_message = ' '.join(subject_lines).strip() if subject_lines else None + return commit_message, author, date + + +_UBUNTU_PATCH_URL_PATTERN = re.compile( + r"(?:upstream:\s*)?(https://github\.com/[^/]+/[^/]+/commit/([a-f0-9]+))" +) + + +async def fetch_ubuntu_patch( + session: aiohttp.ClientSession, + cve_id: str, + package_name: str, + patch_refs: list[str], + timeout: int = _GITHUB_PATCH_TIMEOUT_SECONDS, +) -> OSVPatchResult | None: + """Fetch patch from Ubuntu intel patches field. + + .. deprecated:: + Use :class:`vuln_analysis.utils.web_patch_fetcher.WebPatchFetcher.fetch_from_intel_refs` instead. 
+ + Parses Ubuntu patch refs like: "upstream: https://github.com/curl/curl/commit/39d1976b7f..." + + Args: + session: aiohttp ClientSession for HTTP requests + cve_id: CVE identifier (e.g., "CVE-2024-1234") + package_name: Package name (for logging) + patch_refs: List of patch reference strings from Ubuntu intel + timeout: Timeout for GitHub fetch in seconds + + Returns: + OSVPatchResult with parsed patch data, or None if no valid patch found + """ + github_timeout = aiohttp.ClientTimeout(total=timeout) + + for patch_ref in patch_refs: + match = _UBUNTU_PATCH_URL_PATTERN.search(patch_ref) + if not match: + logger.debug("Ubuntu patch ref does not match GitHub commit pattern: %s", patch_ref) + continue + + commit_url = match.group(1) + commit_sha = match.group(2) + patch_url = f"{commit_url}.patch" + repo_url = commit_url.rsplit("/commit/", 1)[0] + + logger.info("Ubuntu: Fetching patch for %s from %s", cve_id, patch_url) + + try: + async with request_with_retry( + session=session, + request_kwargs={ + 'method': 'GET', + 'url': patch_url, + 'timeout': github_timeout, + }, + max_retries=3, + sleep_time=0.5, + log_on_error=False, + ) as response: + patch_content = await response.text() + except aiohttp.ClientResponseError as e: + if e.status == 404: + logger.info("Ubuntu: GitHub patch not found: %s", patch_url) + else: + logger.warning("Ubuntu: GitHub patch fetch failed: %s - %s", patch_url, e) + continue + except Exception as e: + logger.warning("Ubuntu: GitHub patch fetch failed: %s - %s", patch_url, e) + continue + + if not patch_content: + continue + + commit_message, commit_author, commit_date = _extract_commit_metadata(patch_content) + parsed_patch = _parse_patch_content(patch_content, f"{cve_id}_{commit_sha[:8]}.patch") + + if parsed_patch: + logger.info("Ubuntu: Successfully fetched and parsed patch for %s", cve_id) + return OSVPatchResult( + cve_id=cve_id, + fixed_commit=commit_sha[:8], + repo_url=repo_url, + patch_url=patch_url, + patch_content=patch_content, 
+ parsed_patch=parsed_patch, + commit_message=commit_message, + commit_author=commit_author, + commit_date=commit_date, + ) + + logger.info("Ubuntu: No valid GitHub commit patch found for %s in %d refs", cve_id, len(patch_refs)) + return None + + +class OSVPatchRetriever: + """Retrieve upstream fix patches from OSV when RPM patches are unavailable. + + .. deprecated:: + Use :class:`vuln_analysis.utils.web_patch_fetcher.OSVClient` instead. + The new class provides the same functionality with additional support + for kernel.org and better integration with intel references. + + Usage: + async with aiohttp.ClientSession() as session: + retriever = OSVPatchRetriever(session=session) + result = await retriever.get_fix_patch("CVE-2024-1234", "3.0.7", "openssl") + if result and result.parsed_patch: + # Use result.parsed_patch for agent context + pass + """ + + def __init__( + self, + session: aiohttp.ClientSession, + osv_timeout: int = _OSV_TIMEOUT_SECONDS, + github_timeout: int = _GITHUB_PATCH_TIMEOUT_SECONDS, + ): + warnings.warn( + "OSVPatchRetriever is deprecated, use OSVClient from web_patch_fetcher instead", + DeprecationWarning, + stacklevel=2, + ) + self._session = session + self._osv_timeout = aiohttp.ClientTimeout(total=osv_timeout) + self._github_timeout = aiohttp.ClientTimeout(total=github_timeout) + + async def get_fix_patch( + self, + cve_id: str, + upstream_version: str, + package_name: str | None = None, + ) -> OSVPatchResult | None: + """Main entry point - orchestrates the full workflow. + + Args: + cve_id: CVE identifier (e.g., "CVE-2024-1234") + upstream_version: Upstream version from TargetPackage.version (e.g., "3.0.7") + package_name: Optional package name to help match the correct affected block + + Returns: + OSVPatchResult with patch data, or None if no fix found + """ + try: + osv_data = await self._query_osv(cve_id) + if not osv_data: + return None + + # 1. 
Try to get the highly-specific patch URL from references first + patch_url = self._extract_commit_from_references(osv_data) + fixed_commit = None + repo_url = None + if patch_url: + # Extract repo_url and fixed_commit from the patch_url for the result object + repo_url = patch_url.split('/commit/')[0] if '/commit/' in patch_url else patch_url.split('/pull/')[0] + fixed_commit = patch_url.split('/')[-1].replace('.patch', '') + logger.info("OSV: Found precise fix commit in references for %s", cve_id) + else: + # second try to find the fix commit from the affected block + affected = self._find_matching_affected(osv_data, package_name) + if not affected: + logger.info("OSV: No affected block with fix found for %s", cve_id) + return None + + range_info = self._extract_fix_commit(affected) + if not range_info.fixed_commit or not range_info.repo_url: + logger.info("OSV: No fixed commit found for %s", cve_id) + return None + + patch_url = self._build_patch_url(range_info.repo_url, range_info.fixed_commit) + fixed_commit = range_info.fixed_commit[:8] + repo_url = range_info.repo_url + if not patch_url: + logger.info("OSV: Could not build patch URL for %s (non-GitHub repo?)", cve_id) + return None + + patch_content = await self._fetch_github_patch(patch_url) + if not patch_content: + return None + + commit_message, commit_author, commit_date = _extract_commit_metadata(patch_content) + parsed_patch = _parse_patch_content(patch_content, f"{cve_id}_{fixed_commit}.patch") + + return OSVPatchResult( + cve_id=cve_id, + fixed_commit=fixed_commit, + repo_url=repo_url, + patch_url=patch_url, + patch_content=patch_content, + parsed_patch=parsed_patch, + commit_message=commit_message, + commit_author=commit_author, + commit_date=commit_date, + ) + + except Exception: + logger.warning("OSV patch retrieval failed for %s", cve_id, exc_info=True) + return None + + async def _query_osv(self, cve_id: str) -> dict | None: + """Query OSV API for CVE data. 
+ + Args: + cve_id: CVE identifier + + Returns: + OSV vulnerability data dict, or None on failure + """ + url = f"{_OSV_API_URL}{cve_id}" + try: + async with request_with_retry( + session=self._session, + request_kwargs={ + 'method': 'GET', + 'url': url, + 'timeout': self._osv_timeout, + }, + max_retries=3, + sleep_time=0.5, + log_on_error=False, + ) as response: + return await response.json() + except aiohttp.ClientResponseError as e: + if e.status == 404: + logger.info("OSV: CVE %s not found", cve_id) + else: + logger.warning("OSV query failed for %s: %s", cve_id, e) + return None + except Exception as e: + logger.warning("OSV query failed for %s: %s", cve_id, e) + return None + + def _extract_commit_from_references(self, osv_data: dict) -> str | None: + """Attempt to find the exact fix commit URL from the OSV references array. + + Args: + osv_data: OSV vulnerability data dict + + Returns: + The patch URL if found, otherwise None + """ + references = osv_data.get("references", []) + + for ref in references: + if ref.get("type") == "FIX": + url = ref.get("url", "") + # We look for GitHub URLs containing either /commit/ or /pull/ + if "github.com" in url and ("/commit/" in url or "/pull/" in url): + if not url.endswith(".patch"): + return f"{url}.patch" + return url + + return None + + def _find_matching_affected( + self, + osv_data: dict, + package_name: str | None = None, + ) -> dict | None: + """Find an affected block that has a GIT range with a fixed commit. 
+ + Args: + osv_data: OSV vulnerability data + package_name: Optional package name to filter affected blocks + + Returns: + Matching affected block dict, or None if no match + """ + + + for affected in osv_data.get("affected", []): + + + for range_block in affected.get("ranges", []): + if range_block.get("type") == "GIT": + for event in range_block.get("events", []): + if "fixed" in event: + return affected + + return None + + def _extract_fix_commit(self, affected: dict) -> OSVAffectedRange: + """Extract the fixed commit hash and repo URL from an affected block. + + Args: + affected: OSV affected block + + Returns: + OSVAffectedRange with repo_url and fixed_commit + """ + result = OSVAffectedRange() + + ranges = affected.get("ranges", []) + for range_block in ranges: + if range_block.get("type") != "GIT": + continue + + repo = range_block.get("repo") + if repo: + result.repo_url = repo + + events = range_block.get("events", []) + for event in events: + if "introduced" in event and event["introduced"] != "0": + result.introduced_commit = event["introduced"] + if "fixed" in event: + result.fixed_commit = event["fixed"] + + if result.fixed_commit: + break + + return result + + def _build_patch_url(self, repo_url: str, commit_sha: str) -> str | None: + """Build GitHub patch URL from repo URL and commit SHA. 
+ + Args: + repo_url: Git repository URL (e.g., "https://github.com/openssl/openssl") + commit_sha: Git commit hash + + Returns: + Patch URL (e.g., "https://github.com/openssl/openssl/commit/.patch"), + or None if not a GitHub repo + """ + match = _GITHUB_REPO_PATTERN.match(repo_url) + if not match: + if "github.com" in repo_url: + parts = repo_url.rstrip('/').split('/') + if len(parts) >= 2: + repo_path = '/'.join(parts[-2:]).replace('.git', '') + return f"https://github.com/{repo_path}/commit/{commit_sha}.patch" + logger.debug("Non-GitHub repo URL: %s", repo_url) + return None + + repo_path = match.group(1) + return f"https://github.com/{repo_path}/commit/{commit_sha}.patch" + + async def _fetch_github_patch(self, patch_url: str) -> str | None: + """Download patch content from GitHub. + + Args: + patch_url: URL to the .patch file + + Returns: + Patch content string, or None on failure + """ + try: + async with request_with_retry( + session=self._session, + request_kwargs={ + 'method': 'GET', + 'url': patch_url, + 'timeout': self._github_timeout, + }, + max_retries=3, + sleep_time=0.5, + log_on_error=False, + ) as response: + return await response.text() + except aiohttp.ClientResponseError as e: + if e.status == 404: + logger.info("GitHub patch not found: %s", patch_url) + else: + logger.warning("GitHub patch fetch failed: %s - %s", patch_url, e) + return None + except Exception as e: + logger.warning("GitHub patch fetch failed: %s - %s", patch_url, e) + return None diff --git a/src/vuln_analysis/utils/output_formatter.py b/src/vuln_analysis/utils/output_formatter.py index 5bbbe5e60..6d3acf591 100644 --- a/src/vuln_analysis/utils/output_formatter.py +++ b/src/vuln_analysis/utils/output_formatter.py @@ -109,7 +109,7 @@ def _add_header(markdown_content, model_dict: AgentMorpheusOutput): markdown_content[cve_id].append(f"# Vulnerability Analysis Report for {cve_id}") markdown_content[cve_id].append(f"> **Container Analyzed:** 
`{input_image.name}:{input_image.tag}`\n\n") # Only add SBOM info if it is a file location - if input_image.sbom_info.type == "file": + if input_image.sbom_info and input_image.sbom_info.type == "file": markdown_content[cve_id].append(f"> **SBOM Info:** `{input_image.sbom_info}`\n\n") markdown_content[cve_id].append(f"> **Status:** {_get_expoiltability_text(output.justification.status)}") diff --git a/src/vuln_analysis/utils/package_identifier.py b/src/vuln_analysis/utils/package_identifier.py new file mode 100644 index 000000000..42346d2ab --- /dev/null +++ b/src/vuln_analysis/utils/package_identifier.py @@ -0,0 +1,495 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from univers import versions + +from exploit_iq_commons.data_models.checker_status import EnumIdentifyResult, PackageCheckerStatus, PackageIdentifyResult +from exploit_iq_commons.data_models.cve_intel import CveIntel +from exploit_iq_commons.logging.loggers_factory import LoggingFactory +from exploit_iq_commons.utils.string_utils import package_names_match +from exploit_iq_commons.data_models.common import TargetPackage + +logger = LoggingFactory.get_agent_logger(__name__) + +# NEVRA = Name-Epoch-Version-Release-Architecture, the standard RPM package naming format. 
+_RPM_NEVRA_RE = re.compile(r"^(.+?)-(?:(\d+):)?(\d\S*?)-(\S+)$") +_DIST_TAG_RE = re.compile(r"(el\d+)") +_ARCH_SUFFIXES = frozenset({"x86_64", "aarch64", "i686", "noarch", "s390x", "ppc64le", "armv7hl", "src"}) + + +def _strip_arch_suffix(release_arch: str) -> str: + """Remove .arch suffix if present, preserving dist tags like .el6_10.""" + if "." in release_arch: + base, suffix = release_arch.rsplit(".", 1) + if suffix in _ARCH_SUFFIXES: + return base + return release_arch + + +def _extract_dist_tag(release: str) -> str | None: + """Extract the RHEL dist-tag family (e.g. 'el8') from a release string.""" + m = _DIST_TAG_RE.search(release) + return m.group(1) if m else None + + +_RHEL_VERSION_RE = re.compile(r"el(\d+)") + +_AFFECTED_FIX_STATES = frozenset({ + "affected", "fix deferred", "under investigation", +}) +_NOT_AFFECTED_FIX_STATES = frozenset({ + "not affected", + "will not fix", + "out of support scope", +}) + + +def _extract_rhel_version(distro_tag: str | None) -> str | None: + """Extract RHEL major version from dist-tag (e.g., 'el7' -> '7', 'el10' -> '10').""" + if not distro_tag: + return None + m = _RHEL_VERSION_RE.match(distro_tag) + return m.group(1) if m else None + + +def _match_package_state_for_distro( + package_states: list, + target_name: str, + target_distro: str | None, +): + """Find the PackageState entry matching target package name and distro. + + Returns the matching PackageState or None if no match found. 
+ """ + rhel_version = _extract_rhel_version(target_distro) + + for ps in package_states: + if not ps.package_name or not package_names_match(target_name, ps.package_name): + continue + if rhel_version is None: + return ps # No distro info, return first name match + # Match by CPE (e.g., "cpe:/o:redhat:enterprise_linux:7") + if ps.cpe and f":enterprise_linux:{rhel_version}" in ps.cpe: + return ps + # Match by product_name (e.g., "Red Hat Enterprise Linux 7") + if ps.product_name and re.search(rf"\b{rhel_version}\b", ps.product_name): + return ps + return None + + +def _interpret_fix_state(fix_state: str | None) -> EnumIdentifyResult | None: + """Interpret RHSA fix_state into an identification result (PackageIdentify step 2). + + Returns: + EnumIdentifyResult.NO for not affected / will not fix / out of support scope + EnumIdentifyResult.YES for affected, fix deferred, under investigation + None if fix_state is unknown or missing (fall through to NVD checks) + """ + if not fix_state: + return None + state = fix_state.lower().strip() + if state in _NOT_AFFECTED_FIX_STATES: + return EnumIdentifyResult.NO + if state in _AFFECTED_FIX_STATES: + return EnumIdentifyResult.YES + return None # Unknown state, fall through to other checks + + +class PackageIdentifier: + """ + Deterministic PackageIdentify phase: resolves package identity from intel, + cross-references the SBOM, checks version ranges, and locates RPMs in cache. + """ + + def __init__( + self, + target_package: TargetPackage, + ): + self._target_package = target_package + + + def identify(self, intel: CveIntel | None) -> tuple[PackageCheckerStatus, PackageIdentifyResult]: + """Run PackageIdentify for a single CVE. + + Step 1 — RHSA scope: target must appear in ``package_state`` or + ``affected_release`` when Red Hat published either list; else + ``PKG_IDENT_CVE_MISMATCH``. + Step 2 — Vulnerability posture: ``PKG_IDENT_NOT_VUL`` when RHSA/NVD + shows the target is not affected or already fixed. 
+ """ + + package_identify = PackageIdentifyResult() + status = PackageCheckerStatus.OK + if intel is None: + status = PackageCheckerStatus.ERROR_PKG_IDENT_NO_INTEL + return status, package_identify + + if not self._is_cve_for_target_package(intel): + status = PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH + package_identify.is_target_package_affected = EnumIdentifyResult.NO + return status, package_identify + + package_identify.is_target_package_affected = self._is_target_package_affected(intel,package_identify) + + package_identify.is_target_package_fixed = self._is_target_package_fixed(intel,package_identify) + + if package_identify.is_target_package_affected == EnumIdentifyResult.NO or package_identify.is_target_package_fixed == EnumIdentifyResult.YES: + status = PackageCheckerStatus.PKG_IDENT_NOT_VUL + + return status, package_identify + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _find_and_locate_rpm(self, intel: CveIntel) -> list[str]: + """Extract deduplicated RPM package names from RHSA package_state.""" + packages = self._extract_rhsa(intel) + packages = [p for p in packages if "/" not in p.get("package_name", "/")] + seen: set[str] = set() + names: list[str] = [] + for pkg in packages: + name = pkg.get("package_name") + if name and name not in seen: + seen.add(name) + names.append(name) + return names + + def _is_cve_for_target_package(self, intel: CveIntel) -> bool: + """Step 1: target is in scope for this CVE per Red Hat package lists. + + Returns True if RHSA has no package lists or the target appears in + ``package_state`` (any fix_state) or ``affected_release`` (patched builds). + Returns False when RHSA lists packages but the target matches neither bucket. 
+ """ + if not intel.rhsa: + return True + + has_package_state = bool(intel.rhsa.package_state) + has_affected_release = bool( + getattr(intel.rhsa, "affected_release", None) + ) + if not has_package_state and not has_affected_release: + return True + + target_name = self._target_package.name + if has_package_state: + for ps in intel.rhsa.package_state: + if ps.package_name and package_names_match(target_name, ps.package_name): + return True + if has_affected_release and self._target_in_affected_release(intel): + return True + return False + + def _target_in_affected_release(self, intel: CveIntel) -> bool: + """True when target package name matches an RHSA affected_release NEVRA.""" + target_name = self._target_package.name + for entry in intel.rhsa.affected_release or []: + raw = entry.get("package") if isinstance(entry, dict) else getattr(entry, "package", None) + if not raw: + continue + m = _RPM_NEVRA_RE.match(raw) + if not m: + continue + name = m.group(1) + if "/" in name: + continue + if package_names_match(target_name, name): + return True + return False + + def _is_target_package_affected( + self, intel: CveIntel, package_identify: PackageIdentifyResult, + ) -> EnumIdentifyResult: + """Determine whether the target package is affected by this CVE. + + Priority 1: Check RHSA fix_state (distro-specific vendor assessment). + Priority 2: Fall back to NVD version range check. + Only returns NO with definitive proof; defaults to UNKNOWN otherwise. 
+ """ + rpm_names = self._find_and_locate_rpm(intel) + if not rpm_names: + return EnumIdentifyResult.UNKNOWN + package_identify.affected_rpm_list = rpm_names + + target_name = self._target_package.name + target_version = self._target_package.version + target_release = self._target_package.release + target_distro = _extract_dist_tag(target_release) if target_release else None + + # Priority 1: Check RHSA fix_state (distro-specific vendor assessment) + if intel.rhsa and intel.rhsa.package_state: + matched_ps = _match_package_state_for_distro( + intel.rhsa.package_state, target_name, target_distro + ) + if matched_ps: + result = _interpret_fix_state(matched_ps.fix_state) + if result is not None: + logger.debug( + "RHSA fix_state=%s for %s on %s -> %s", + matched_ps.fix_state, target_name, target_distro, result.value + ) + if result == EnumIdentifyResult.NO: + package_identify.conclusion_reason = ( + f"RHSA fix_state indicates package is not vulnerable for analysis. " + f"Package: {target_name}, Distro: {target_distro or 'unknown'}, " + f"fix_state: '{matched_ps.fix_state}'" + ) + return result + + # Priority 2: Fall back to NVD version range check + name_matched = any(package_names_match(target_name, name) for name in rpm_names) + + if name_matched: + if target_version: + in_range = self._version_in_affected_range(target_version, intel) + if not in_range: + version_range_str = self._format_nvd_version_range(intel, target_name) + package_identify.conclusion_reason = ( + f"Target version is outside NVD affected version range. 
" + f"Package: {target_name}-{target_version}, " + f"Affected range: {version_range_str}" + ) + return EnumIdentifyResult.NO + return EnumIdentifyResult.YES + return EnumIdentifyResult.YES + + if target_version and intel.nvd and intel.nvd.configurations: + in_range = self._version_in_affected_range(target_version, intel) + if not in_range: + version_range_str = self._format_nvd_version_range(intel, target_name) + package_identify.conclusion_reason = ( + f"Target version is outside NVD affected version range. " + f"Package: {target_name}-{target_version}, " + f"Affected range: {version_range_str}" + ) + return EnumIdentifyResult.NO + return EnumIdentifyResult.UNKNOWN + + return EnumIdentifyResult.UNKNOWN + + def _format_nvd_version_range(self, intel: CveIntel, target_name: str) -> str: + """Format NVD version range for human-readable output.""" + if intel.nvd is None or not intel.nvd.configurations: + return "unknown" + + ranges = [] + for config in intel.nvd.configurations: + if not package_names_match(target_name, config.package): + continue + parts = [] + if config.versionStartIncluding: + parts.append(f">={config.versionStartIncluding}") + if config.versionStartExcluding: + parts.append(f">{config.versionStartExcluding}") + if config.versionEndIncluding: + parts.append(f"<={config.versionEndIncluding}") + if config.versionEndExcluding: + parts.append(f"<{config.versionEndExcluding}") + if parts: + ranges.append(" && ".join(parts)) + + return " OR ".join(ranges) if ranges else "any version (no range specified)" + + def _is_target_package_fixed(self, intel: CveIntel, package_identify: PackageIdentifyResult) -> EnumIdentifyResult: + """Determine whether the target package is already running the fixed version. + + Task 1: populate fixed_rpm_list from RHSA affected_release. + Task 2: compare target version+release against fix NVR. 
+ """ + fix_entries = self._extract_fixed_rpms(intel) + if not fix_entries: + return EnumIdentifyResult.UNKNOWN + package_identify.fixed_rpm_list = [e["nevra"] for e in fix_entries] + + target_name = self._target_package.name + matching = [e for e in fix_entries if package_names_match(target_name, e["name"])] + if not matching: + return EnumIdentifyResult.UNKNOWN + + # NOTE: Version comparison disabled to test Option A (rely entirely on Verify phase). + # fixed_rpm_list is still populated for reference/logging. + # To re-enable, uncomment the block below. + return EnumIdentifyResult.UNKNOWN + + # --- DISABLED: Version comparison logic --- + # target_version = self._target_package.version + # target_release = self._target_package.release + # + # fix = matching[0] + # try: + # target_nvr = f"{target_version}-{target_release}" + # fix_nvr = f"{fix['version']}-{fix['release']}" + # + # target_dist = _extract_dist_tag(target_release) if target_release else None + # fix_dist = _extract_dist_tag(fix["release"]) + # if target_dist and fix_dist and target_dist != fix_dist: + # logger.debug( + # "Cross-stream fix comparison skipped: target=%s fix=%s", + # target_dist, fix_dist, + # ) + # return EnumIdentifyResult.UNKNOWN + # + # if versions.RpmVersion(target_nvr) >= versions.RpmVersion(fix_nvr): + # package_identify.conclusion_reason = ( + # f"Target package version is at or above the fix version. 
" + # f"Target: {target_name}-{target_nvr}, Fix: {fix_nvr}" + # ) + # return EnumIdentifyResult.YES + # return EnumIdentifyResult.NO + # except Exception as exc: + # logger.debug("Fix version comparison failed: %s", exc) + # return EnumIdentifyResult.UNKNOWN
+
+
+    def _version_in_affected_range(self, target_version: str, intel: CveIntel) -> bool:
+        """Report whether ``target_version`` is covered by an NVD affected range.
+
+        Only NVD configurations whose package matches the target package
+        (per ``package_names_match``) are considered, and configurations
+        that carry no version bound at all are skipped.
+
+        Returns:
+            ``True`` when a matching configuration's range contains the
+            version, and also -- conservatively -- when there is no NVD
+            configuration data, when no configuration matches the package,
+            or when the version comparison raises. ``False`` when at least
+            one configuration matched the package but none of the ranges
+            that were checked contained the version.
+        """
+        nvd = intel.nvd
+        if nvd is None or not nvd.configurations:
+            # No range data at all -> conservatively assume affected.
+            return True
+
+        package_name = self._target_package.name
+        saw_matching_config = False
+        for cfg in nvd.configurations:
+            if not package_names_match(package_name, cfg.package):
+                continue
+            saw_matching_config = True
+            # Order matters: _check_version_in_range unpacks this list as
+            # (start-excluding, end-excluding, start-including, end-including).
+            bounds = [
+                cfg.versionStartExcluding,
+                cfg.versionEndExcluding,
+                cfg.versionStartIncluding,
+                cfg.versionEndIncluding,
+            ]
+            if not any(bound is not None for bound in bounds):
+                continue
+            try:
+                in_range = self._check_version_in_range(target_version, bounds)
+            except Exception as exc:
+                logger.debug("Version comparison failed for %s: %s", target_version, exc)
+                # Conservative: assume affected when the versions cannot be compared.
+                return True
+            if in_range:
+                return True
+
+        # With no configuration for this package the answer is conservatively
+        # "affected"; otherwise every range that was checked excluded the version.
+        return not saw_matching_config
+
+    @staticmethod
+    def _check_version_in_range(version_to_check: str, version_range: list[str | None]) -> bool:
+        """Test ``version_to_check`` against one set of start/end bounds.
+
+        Per the original note, this reuses the logic of
+        ``VulnerableDependencyChecker._check_version_in_range``.
+
+        Args:
+            version_to_check: Version string being tested.
+            version_range: Four bounds in the order start-excluding,
+                end-excluding, start-including, end-including. Falsy entries
+                (``None`` or an empty string) mean "no such bound".
+
+        Returns:
+            ``True`` when the version satisfies the lower and the upper bound
+            that apply, ``False`` otherwise. On each side the inclusive bound
+            takes precedence over the exclusive one.
+        """
+        start_excl, end_excl, start_incl, end_incl = version_range
+
+        # Pick the version scheme from textual hints in any non-None bound or in
+        # the version itself: "el" selects RPM, "deb"/"ubuntu" selects Debian.
+        hint_sources = [bound for bound in version_range if bound is not None]
+        hint_sources.append(version_to_check)
+        looks_rpm = any("el" in str(text) for text in hint_sources)
+        looks_deb = any("deb" in str(text) or "ubuntu" in str(text) for text in hint_sources)
+        if looks_rpm:
+            make_version = versions.RpmVersion
+        elif looks_deb:
+            make_version = versions.DebianVersion
+        else:
+            make_version = versions.GenericVersion
+
+        # Build every version object before comparing anything.
+        target = make_version(version_to_check)
+        lower_incl = make_version(start_incl) if start_incl else None
+        lower_excl = make_version(start_excl) if start_excl else None
+        upper_incl = make_version(end_incl) if end_incl else None
+        upper_excl = make_version(end_excl) if end_excl else None
+
+        if lower_incl:
+            below_range = not (lower_incl <= target)
+        elif lower_excl:
+            below_range = not (lower_excl < target)
+        else:
+            below_range = False
+        if below_range:
+            return False
+
+        if upper_incl:
+            above_range = not (target <= upper_incl)
+        elif upper_excl:
+            above_range = not (target < upper_excl)
+        else:
+            above_range = False
+        return not above_range
+
+    # ------------------------------------------------------------------
+    # Intel extraction
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_rhsa(intel: CveIntel) -> list[dict]:
+        """Collect the package names listed in the RHSA ``package_state`` data.
+
+        Returns:
+            One ``{"package_name": ...}`` dict per entry that has a truthy
+            ``package_name``; an empty list when the RHSA data or its
+            ``package_state`` is missing or empty.
+        """
+        rhsa = intel.rhsa
+        if rhsa is None or not rhsa.package_state:
+            return []
+        return [
+            {"package_name": state.package_name}
+            for state in rhsa.package_state
+            if state.package_name
+        ]
+
+    @staticmethod
+    def _extract_fixed_rpms(intel: CveIntel) -> list[dict]:
+        """List every fix entry found in the RHSA ``affected_release`` data.
+
+        Entries without a ``package`` value, entries whose value does not match
+        ``_RPM_NEVRA_RE``, and entries whose parsed name contains "/" are skipped.
+
+        Returns:
+            A list of dicts with keys ``nevra``, ``name``, ``version`` and
+            ``release`` (regex group 4 passed through ``_strip_arch_suffix``);
+            empty when the RHSA data or ``affected_release`` is missing or empty.
+        """
+        rhsa = intel.rhsa
+        if rhsa is None or not hasattr(rhsa, "affected_release"):
+            return []
+        entries = rhsa.affected_release
+        if not entries:
+            return []
+
+        fixed: list[dict] = []
+        for item in entries:
+            # An entry is read either as a dict or as an object with a ``package`` attribute.
+            if isinstance(item, dict):
+                nevra = item.get("package")
+            else:
+                nevra = getattr(item, "package", None)
+            parsed = _RPM_NEVRA_RE.match(nevra) if nevra else None
+            if not parsed:
+                continue
+            pkg_name = parsed.group(1)
+            if "/" in pkg_name:
+                continue
+            fixed.append(
+                {
+                    "nevra": nevra,
+                    "name": pkg_name,
+                    "version": parsed.group(3),
+                    "release": _strip_arch_suffix(parsed.group(4)),
+                }
+            )
+        return fixed
+
+    @staticmethod
+    def _extract_fix_info(intel: CveIntel | None, resolved_name: str) -> dict:
+        """Find the RHSA ``affected_release`` fix entry for ``resolved_name``.
+
+        The parsed package name is compared with ``resolved_name``
+        case-insensitively, and the first matching entry wins.
+
+        Returns:
+            A dict with keys ``nevra``, ``name``, ``version`` and ``release``
+            for the first matching entry; an empty dict when there is no RHSA
+            data, no ``affected_release`` entries, or no matching entry.
+        """
+        if intel is None:
+            return {}
+        rhsa = intel.rhsa
+        if rhsa is None or not hasattr(rhsa, "affected_release"):
+            return {}
+        entries = rhsa.affected_release
+        if not entries:
+            return {}
+
+        for item in entries:
+            # An entry is read either as a dict or as an object with a ``package`` attribute.
+            if isinstance(item, dict):
+                nevra = item.get("package")
+            else:
+                nevra = getattr(item, "package", None)
+            parsed = _RPM_NEVRA_RE.match(nevra) if nevra else None
+            if not parsed:
+                continue
+            pkg_name = parsed.group(1)
+            if pkg_name.lower() == resolved_name.lower():
+                return {
+                    "nevra": nevra,
+                    "name": pkg_name,
+                    "version": parsed.group(3),
+                    "release": _strip_arch_suffix(parsed.group(4)),
+                }
+        return {}
+
diff --git a/src/vuln_analysis/utils/rpm_checker_prompts.py b/src/vuln_analysis/utils/rpm_checker_prompts.py new file mode 100644 index 000000000..793b96692 --- /dev/null +++ b/src/vuln_analysis/utils/rpm_checker_prompts.py @@ -0,0 +1,1678 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Prompt templates for the RPM Checker L1 (Code Agent) and L2 (Build Agent) phases. + +L1 prompts handle source code verification for CVE patch status. +L2 prompts handle build-time configuration and hardening flag verification.
+""" + +# =========================================================================== +# L1 CODE AGENT PROMPTS +# =========================================================================== + +# --------------------------------------------------------------------------- +# L1 Verdict Extraction +# --------------------------------------------------------------------------- + +L1_VERDICT_EXTRACTION_PROMPT = """\ +Extract the security verdict from this L1 agent investigation conclusion. + +CVE: {vuln_id} +Package: {target_package} + +L1 Agent Final Answer: +{final_answer} + +Classify the conclusion into one of these categories: +- "protected": The package is protected (patch applied, fix backported, or mitigating control present) +- "not_present": The vulnerable code/function is not present in this version +- "vulnerable": The vulnerable code is confirmed present and unpatched +- "uncertain": Insufficient evidence or conflicting findings + +Provide your confidence level (0.0-1.0) based on the strength of evidence in the answer. +""" + +# --------------------------------------------------------------------------- +# L1 Vulnerability Intel Extraction +# --------------------------------------------------------------------------- + +VULNERABILITY_INTEL_EXTRACTION_PROMPT = """\ +Extract structured vulnerability intelligence from the CVE data and patch content. +Your output will be used to guide source code searches, so focus on grep-able patterns. + + +CVE ID: {vuln_id} +Package: {target_package} +CVE Description: {cve_description} + + + +{vendor_mitigations} + + + +{patch_data} + + + +1. affected_files: Extract file paths from patch headers (strip a/ b/ prefixes) +2. vulnerable_functions: Extract function names from: + - Removed lines (- lines) in patch + - Function names mentioned in CVE description + - Functions or buffers mentioned in VENDOR_MITIGATIONS +3. 
vulnerable_variables: Extract variable names from: + - Removed lines that are key to the vulnerability + - Variables/buffers explicitly mentioned in VENDOR_MITIGATIONS (e.g., "sum2 buffer") +4. vulnerable_patterns: Extract distinctive code snippets from removed lines (- lines) + - Focus on patterns that can be grepped + - Include enough context to be unique +5. fix_patterns: Extract distinctive code snippets from added lines (+ lines) + - These indicate the fix is present +6. root_cause: Explain WHY the code is vulnerable in 1-2 sentences + - Incorporate insights from VENDOR_MITIGATIONS if provided +7. vulnerability_type: Classify as one of: buffer_overflow, integer_overflow, use_after_free, + null_deref, format_string, race_condition, path_traversal, injection, uninitialized_memory, other +8. search_keywords: List 3-5 grep patterns ordered by specificity: + - Start with most specific (unique variable/function names from patch or mitigations) + - End with broader patterns (file names, component names) +9. affected_bitness: Determine which bitness is affected: + - "32-bit": Look for "32-bit systems", "i386", "i686", "on 32-bit" + - "64-bit": Look for "64-bit only", "x86_64 only" (rare) + - "both": DEFAULT when not explicitly stated + NOTE: Do NOT assume bitness based on the vulnerability type. Default to "both" unless explicitly stated. +10. affected_architectures: Determine which CPU families are affected (or null for all): + - Look for: "x86", "Intel", "AMD" -> ["x86"] + - Look for: "ARM", "aarch64", "arm64" -> ["arm"] + - Look for: "PowerPC", "POWER", "ppc64" -> ["ppc"] + - Look for: "s390", "z/Architecture", "IBM Z" -> ["s390"] + - If none mentioned or "all architectures" -> null (affects all) + NOTE: Most CVEs affect all architectures. Only extract specific families if explicitly mentioned. +11. known_mitigations: Copy vendor-provided mitigations verbatim if present (e.g., compiler flags like "-ftrivial-auto-var-init=zero", configuration changes). 
Empty if none provided. + + + +- If no patch is provided, extract what you can from the CVE description and VENDOR_MITIGATIONS +- VENDOR_MITIGATIONS often contain specific variable names, buffers, or compiler flags that are highly relevant +- For search_keywords, prefer identifiers over natural language +- Patterns should be grep-friendly (avoid regex special chars unless escaped) + +""" + +# --------------------------------------------------------------------------- +# L1 System Prompts +# --------------------------------------------------------------------------- + +L1_AGENT_SYS_PROMPT_PATCH_AVAILABLE = ( + "You are a security analyst investigating whether a CVE fix has been applied to a package.\n" + "A downstream patch file exists and has been analyzed.\n\n" + "VULNERABILITY_INTEL contains DOWNSTREAM_PATCH_STATUS and extracted patterns from the patch.\n" + "The source code index contains the UNPATCHED tarball; the patch is applied at BUILD time.\n\n" + "YOUR TASK: Verify (1) vulnerable code exists in source, (2) fix pattern is absent.\n" + "Both outcomes are EXPECTED when DOWNSTREAM_PATCH_STATUS is APPLIED.\n\n" + "CRITICAL RULES:\n" + "- If DOWNSTREAM_PATCH_STATUS is APPLIED, the package is PATCHED (patch applied at build time).\n" + "- Finding vulnerable code in source is EXPECTED (source is unpatched tarball).\n" + "- NOT finding fix pattern in source is EXPECTED (fix is in patch file, not tarball).\n" + "- Both findings together confirm the patch will correctly fix the code at build time.\n\n" + "ANSWER QUALITY:\n" + "- Cite specific file paths and line numbers from tool results.\n" + "- Quote the actual code found, not just describe it.\n" + "- Confirm the patch addresses the vulnerable code found.\n" + "- State confidence level based on evidence quality." 
+) + +L1_AGENT_SYS_PROMPT_UPSTREAM_PATCH = ( + "You are a security analyst verifying that a package is VULNERABLE to a CVE.\n" + "The TARGET package does NOT contain a CVE-specific patch file.\n" + "However, patterns have been extracted from the patch in a FIXED RPM version.\n\n" + "VULNERABILITY_INTEL contains patterns extracted from the fixed version's patch.\n\n" + "YOUR TASK: Verify the TARGET package contains the vulnerable code and LACKS the fix.\n\n" + "VERIFICATION STRATEGY:\n" + "1. FIRST search for the VULNERABLE code pattern (from VULNERABLE_PATTERNS).\n" + " - Use function names, variable names, or unique code snippets.\n" + " - The vulnerable code SHOULD exist in the target package.\n" + "2. If vulnerable code is found, search for the FIX code pattern (from FIX_PATTERNS).\n" + " - The fix code should NOT exist in the target package.\n" + "3. CONCLUSION:\n" + " - If vulnerable code EXISTS and fix is ABSENT → Package is VULNERABLE.\n" + " - If fix code IS found → Package may be patched via rebase (investigate further).\n" + " - If neither is found → Use file paths from AFFECTED_FILES to locate relevant code.\n\n" + "CRITICAL RULES:\n" + "- The patch is from a FIXED version - expect the target to have vulnerable code.\n" + "- Use file paths and function names from VULNERABILITY_INTEL to locate code.\n" + "- Search for distinctive code patterns, not generic keywords.\n" + "- Base conclusions ONLY on tool results, not assumptions.\n\n" + "ANSWER QUALITY:\n" + "- Cite specific file paths and line numbers from tool results.\n" + "- Quote the actual code found, not just describe it.\n" + "- Compare found code against both vulnerable and fix patterns.\n" + "- Clearly state whether vulnerable code exists and whether fix is absent.\n" + "- State confidence level based on evidence quality." 
+) + +L1_AGENT_SYS_PROMPT_REBASE_FIX = ( + "You are a security analyst verifying that a CVE fix is PRESENT in a rebased package.\n" + "The TARGET package was REBASED to a newer upstream version that claims to fix this CVE.\n\n" + "VULNERABILITY_INTEL contains patterns extracted from the upstream fix.\n\n" + "YOUR TASK: Verify the TARGET package contains the FIX code (proving rebase was effective).\n\n" + "VERIFICATION STRATEGY:\n" + "1. FIRST search for the FIX code pattern (from FIX_PATTERNS).\n" + " - Use function names, variable names, or unique code snippets.\n" + " - The fix code SHOULD exist in the target package (proving rebase worked).\n" + "2. If fix code is found, optionally confirm VULNERABLE code is ABSENT.\n" + " - The vulnerable code should NOT exist (was replaced by the fix).\n" + "3. CONCLUSION:\n" + " - If fix code EXISTS → Package is PATCHED via rebase.\n" + " - If vulnerable code still EXISTS and fix is ABSENT → Rebase may be incomplete.\n" + " - If neither is found → Use file paths from AFFECTED_FILES to locate relevant code.\n\n" + "CRITICAL RULES:\n" + "- The patch is from a FIXED version - expect the target to have the fix code.\n" + "- Use file paths and function names from VULNERABILITY_INTEL to locate code.\n" + "- Search for distinctive code patterns, not generic keywords.\n" + "- Base conclusions ONLY on tool results, not assumptions.\n\n" + "ANSWER QUALITY:\n" + "- Cite specific file paths and line numbers from tool results.\n" + "- Quote the actual code found, not just describe it.\n" + "- Compare found code against both vulnerable and fix patterns.\n" + "- Clearly state whether fix code exists, confirming the rebase.\n" + "- State confidence level based on evidence quality." +) + +L1_AGENT_SYS_PROMPT_REBASE_NO_PATCH = """You are a security analyst verifying that a CVE fix is PRESENT in a rebased package. +The TARGET package was REBASED to a newer upstream version that claims to fix this CVE. 
+NO PATCH FILE IS AVAILABLE - you must use the CVE description to guide your search. + +YOUR TASK: Verify the TARGET package contains the fix by searching for: +1. Code patterns mentioned in the CVE description +2. Defensive code that would mitigate the vulnerability +3. Function/symbol names related to the CVE + +VERIFICATION STRATEGY (No Patch Mode): +1. EXTRACT key identifiers from the CVE description: + - Function names, API calls, variable names + - Vulnerable code constructs described + - Fixed/secure code patterns described +2. SEARCH for these patterns in the target source code +3. ANALYZE the code to determine if it shows the fix behavior +4. CONCLUDE based on presence of defensive code and absence of vulnerability indicators""" + +# --------------------------------------------------------------------------- +# L1 Prompt Templates +# --------------------------------------------------------------------------- + +L1_AGENT_PROMPT_TEMPLATE = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} + + + +{vulnerability_intel} + + + +{tools} + + + +{tool_selection_strategy} + + +{tool_instructions} + +RESPONSE: +{{""" + +L1_AGENT_PROMPT_TEMPLATE_NO_PATCH = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} + + + +{vulnerability_intel} + + + +{tools} + + +{tool_selection_strategy} + +{tool_instructions}""" + +# --------------------------------------------------------------------------- +# L1 Thought Instructions +# --------------------------------------------------------------------------- + +L1_AGENT_THOUGHT_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. 
Your next action MUST build on findings - progress the investigation + + + +PHASE 0 - CHECK PATCH STATUS (PRIORITY): + FIRST check VULNERABILITY_INTEL for DOWNSTREAM_PATCH_STATUS. + If DOWNSTREAM_PATCH_STATUS is APPLIED: + - The source code index contains the UNPATCHED tarball + - The patch file is applied at BUILD time, not in the indexed source + - Do 2 verification searches, then FINISH with verdict PATCHED + +PHASE 1 - INTELLIGENCE (PRE-COMPLETED): + Review VULNERABILITY_INTEL above. It contains: + - DOWNSTREAM_PATCH_STATUS: APPLIED means package is patched at build time + - PATCH_FILE: Name of the patch file + - AFFECTED_FILES: Files to verify + - VULNERABLE_FUNCTIONS: Functions to search for + - VULNERABLE_PATTERNS: Code patterns indicating vulnerability + - FIX_PATTERNS: Code patterns indicating the fix (will be ABSENT in source) + +PHASE 2 - SOURCE CODE INSPECTION (when DOWNSTREAM_PATCH_STATUS is APPLIED): + Do exactly 2 verification searches: + 1. Search for vulnerable function/pattern → should FIND it (source is unpatched) + 2. Search for fix pattern → should NOT find it (fix is in separate patch file) + Both outcomes are EXPECTED and confirm the patch is correct. + After both searches, FINISH immediately with PATCHED verdict. + +PHASE 3 - VERDICT: + If DOWNSTREAM_PATCH_STATUS is APPLIED: + - Found vulnerable code + fix absent = PATCHED (patch will fix it at build time) + - This is the EXPECTED outcome, not a failure + Conclude after 2 searches - do NOT keep searching. + + + +1. You MUST select a tool ONLY from . Do NOT invent or use any other tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions. mode="finish" REQUIRES final_answer. +4. If DOWNSTREAM_PATCH_STATUS is APPLIED, do max 2 searches then conclude PATCHED. +5. Do NOT call the same tool with the same input twice - CHECK KNOWLEDGE for prior calls. +6. 
When patch is APPLIED: finding vulnerable code = GOOD, not finding fix = GOOD (expected). +7. If a pattern contains special regex characters, escape them or use literal substrings. + + + +If DOWNSTREAM_PATCH_STATUS is APPLIED: +- Search 1: Find vulnerable function → EXPECTED to find (source is unpatched) +- Search 2: Check fix pattern → EXPECTED to NOT find (fix is in patch file) +- After both: FINISH with PATCHED verdict +If a search returned results: +- If vulnerable code found and patch is APPLIED, proceed to verify fix is absent +- After both checks complete, FINISH +If a pattern wasn't found: +- Try simpler substrings or partial patterns +- Try a different tool (Source Grep <-> Code Keyword Search) + + + +{{"thought": "DOWNSTREAM_PATCH_STATUS is APPLIED. Search for vulnerable function first", "mode": "act", "actions": {{"tool": "Source Grep", "query": "parse_rockridge", "reason": "Verify vulnerable function exists in unpatched source"}}, "final_answer": null}} + + +{{"thought": "Found vulnerable function. Now verify fix pattern is absent (expected since fix is in patch file)", "mode": "act", "actions": {{"tool": "Source Grep", "query": "if (file->pz_log2_bs < 15", "reason": "Confirm fix pattern is absent from source"}}, "final_answer": null}} + + +{{"thought": "Vulnerable code found, fix absent as expected. DOWNSTREAM_PATCH_STATUS is APPLIED so package is PATCHED.", "mode": "finish", "actions": null, "final_answer": "The package is PATCHED. Found vulnerable function at file.c:123. Fix pattern absent from source (expected - fix is in patch file applied at build time). DOWNSTREAM_PATCH_STATUS confirms patch is applied."}} + + +{{"thought": "KNOWLEDGE has sufficient evidence: vulnerable code at X, fix absent", "mode": "finish", "actions": null, "final_answer": "The package is [PATCHED/VULNERABLE]. Found [evidence] at [file:line]. 
The code [matches/differs from] the patch because [reason]."}} +""" + +L1_AGENT_THOUGHT_UPSTREAM_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +PHASE 1 - INTELLIGENCE (PRE-COMPLETED): + Review VULNERABILITY_INTEL above. It contains: + - AFFECTED_FILES: Files to verify + - VULNERABLE_FUNCTIONS: Functions to search for + - VULNERABLE_PATTERNS: Code patterns indicating vulnerability + - FIX_PATTERNS: Code patterns indicating the fix + - SEARCH_KEYWORDS: Terms to grep for + +PHASE 2 - SOURCE CODE INSPECTION (YOUR TASK): + For EACH item in VULNERABLE_FUNCTIONS and AFFECTED_FILES: + 1. Search for vulnerable pattern - it SHOULD exist in unpatched target + 2. Search for fix pattern - it should NOT exist in unpatched target + IMPORTANT: Do NOT stop after finding the first file. Check ALL AFFECTED_FILES. + +PHASE 3 - VERDICT: + Only conclude when: + - ALL AFFECTED_FILES have been searched + - ALL VULNERABLE_FUNCTIONS have been located + - Evidence is sufficient for confident verdict + + + +1. You MUST select a tool ONLY from . Do NOT invent or use any other tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions. mode="finish" REQUIRES final_answer. +4. Source Grep: use query field with pattern from VULNERABILITY_INTEL (function name, variable, or code snippet). +5. Code Keyword Search: use query field for broader searches. +6. Do NOT call the same tool with the same input twice - CHECK KNOWLEDGE for prior calls. +7. FIRST search for VULNERABLE code - it SHOULD exist in target. +8. 
THEN search for FIX code - it should NOT exist in target. +9. If a pattern contains special regex characters, escape them or use literal substrings. +10. Before PATCHED: verify FIX_APPLIED_AT_CALL_SITE in ALL AFFECTED_FILES. FIX_DEFINITION_FOUND alone is insufficient. +11. Fix must be CALLED and result USED (assigned/in condition). Called but unused = not applied. + + + +If a search returned results: +- Narrow down by searching within that specific file (e.g., "pattern,filename.c") +- Search for related symbols or variables from the code found +If a pattern wasn't found: +- Try simpler substrings or partial patterns +- Try a different tool (Source Grep <-> Code Keyword Search) +- Search for file paths from VULNERABILITY_INTEL AFFECTED_FILES +If KNOWLEDGE shows partial evidence: +- Investigate other files mentioned in VULNERABILITY_INTEL AFFECTED_FILES +- Search for key variables from the fix pattern +If FIX_DEFINITION_FOUND: search AFFECTED_FILES for actual usage before concluding PATCHED. + + + +{{"thought": "No prior searches in KNOWLEDGE. Search for the vulnerable code pattern from the patch", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Locate vulnerable code that should exist in unpatched target"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows function found at iso9660.c:2074. Now verify the fix is NOT present", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Check if fix code is absent (confirms vulnerability)"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows fix pattern not found but need more evidence. Search for key variable in the found file", "mode": "act", "actions": {{"tool": "Source Grep", "query": ",", "reason": "Examine how the vulnerable variable is handled"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows Source Grep failed. 
Try Code Keyword Search for the file from patch", "mode": "act", "actions": {{"tool": "Code Keyword Search", "query": "", "reason": "Verify we are looking at the correct file"}}, "final_answer": null}} + + +{{"thought": "Vulnerable code found, fix absent", "mode": "finish", "actions": null, "final_answer": "VULNERABLE. Found vulnerable code at [file:line]. Fix not present."}} + + +{{"thought": "FIX_APPLIED_AT_CALL_SITE found despite no CVE patch", "mode": "finish", "actions": null, "final_answer": "PATCHED via rebase. Fix applied at [file:line]. Included via upstream update."}} + + +{{"thought": "FIX_DEFINITION_FOUND but no call site evidence", "mode": "finish", "actions": null, "final_answer": "UNCERTAIN - fix function exists at [file:line] but usage in AFFECTED_FILES unverified. Manual review required."}} +""" + +L1_AGENT_THOUGHT_REBASE_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +PHASE 1 - INTELLIGENCE (PRE-COMPLETED): + Review VULNERABILITY_INTEL above. It contains: + - AFFECTED_FILES: Files to verify + - VULNERABLE_FUNCTIONS: Functions to search for + - VULNERABLE_PATTERNS: Code patterns indicating vulnerability + - FIX_PATTERNS: Code patterns indicating the fix + - SEARCH_KEYWORDS: Terms to grep for + +PHASE 2 - SOURCE CODE INSPECTION (YOUR TASK): + For EACH item in VULNERABLE_FUNCTIONS and AFFECTED_FILES: + 1. Search for fix pattern - it SHOULD exist in rebased target + 2. Verify vulnerable pattern is ABSENT from target + IMPORTANT: Do NOT stop after finding the first file. Check ALL AFFECTED_FILES. 
+ +PHASE 3 - VERDICT: + Only conclude when: + - ALL AFFECTED_FILES have been searched + - ALL VULNERABLE_FUNCTIONS have been located + - Evidence is sufficient for confident verdict + + + +1. You MUST select a tool ONLY from . Do NOT invent or use any other tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions. mode="finish" REQUIRES final_answer. +4. Source Grep: use query field with pattern from VULNERABILITY_INTEL (function name, variable, or code snippet). +5. Code Keyword Search: use query field for broader searches. +6. Do NOT call the same tool with the same input twice - CHECK KNOWLEDGE for prior calls. +7. FIRST search for FIX code - it SHOULD exist in rebased target. +8. THEN verify VULNERABLE code is ABSENT from target. +9. If a pattern contains special regex characters, escape them or use literal substrings. +10. Before PATCHED: verify FIX_APPLIED_AT_CALL_SITE in ALL AFFECTED_FILES. FIX_DEFINITION_FOUND alone is insufficient. +11. Fix must be CALLED and result USED (assigned/in condition). Called but unused = not applied. + + + +If a search returned results: +- Narrow down by searching within that specific file (e.g., "pattern,filename.c") +- Search for related symbols or variables from the code found +If a pattern wasn't found: +- Try simpler substrings or partial patterns +- Try a different tool (Source Grep <-> Code Keyword Search) +- Search for file paths from VULNERABILITY_INTEL AFFECTED_FILES +If KNOWLEDGE shows partial evidence: +- Investigate other files mentioned in VULNERABILITY_INTEL AFFECTED_FILES +- Search for key variables from the fix pattern +If FIX_DEFINITION_FOUND: search AFFECTED_FILES for actual usage before concluding PATCHED. + + + +{{"thought": "No prior searches in KNOWLEDGE. 
Search for the fix code pattern from the patch", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Locate fix code that should exist after rebase"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows fix pattern not found. Try searching for key variable from the fix", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Find how the fix-related variable is handled"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows variable found at file.c:100. Search for the full fix pattern in that file", "mode": "act", "actions": {{"tool": "Source Grep", "query": ",file.c", "reason": "Check if fix exists in the located file"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows fix confirmed. Now verify the vulnerable code is absent", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Check if vulnerable code was removed (confirms fix)"}}, "final_answer": null}} + + +{{"thought": "FIX_APPLIED_AT_CALL_SITE confirmed, vulnerable code absent", "mode": "finish", "actions": null, "final_answer": "PATCHED via rebase. Fix at [file:line]. Vulnerable code absent."}} + + +{{"thought": "Vulnerable code still present despite rebase", "mode": "finish", "actions": null, "final_answer": "INCOMPLETE rebase. Vulnerable code at [file:line]. Manual review required."}} + + +{{"thought": "FIX_DEFINITION_FOUND but no call site evidence", "mode": "finish", "actions": null, "final_answer": "UNCERTAIN - fix function exists at [file:line] but usage in AFFECTED_FILES unverified. Manual review required."}} +""" + +L1_AGENT_THOUGHT_CVE_DESC_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. 
Your next action MUST build on findings - progress the investigation + + + +PHASE 1 - INTELLIGENCE (PRE-COMPLETED): + Review VULNERABILITY_INTEL above. It contains: + - AFFECTED_FILES: Files to verify (may be inferred from CVE description) + - VULNERABLE_FUNCTIONS: Functions to search for + - VULNERABLE_PATTERNS: Code patterns indicating vulnerability + - FIX_PATTERNS: Code patterns indicating the fix + - SEARCH_KEYWORDS: Terms to grep for + - ROOT_CAUSE: Description of the vulnerability mechanism + +PHASE 2 - SOURCE CODE INSPECTION (YOUR TASK): + For EACH item in VULNERABLE_FUNCTIONS and SEARCH_KEYWORDS: + 1. Search for vulnerable code patterns + 2. Search for defensive/fix patterns (bounds checks, validation, etc.) + IMPORTANT: Do NOT stop after finding the first file. Check ALL potential locations. + +PHASE 3 - VERDICT: + Only conclude when: + - Key files have been searched + - Vulnerable functions have been located + - Evidence is sufficient for confident verdict + + +RESPONSE FORMAT (JSON): +You must respond with a JSON object with these fields: +- thought: Your reasoning based on KNOWLEDGE and VULNERABILITY_INTEL (reference what was already found) +- mode: "act" (to use a tool) or "finish" (to provide final answer) +- actions: (only if mode="act") {{"tool": "Tool Name", "query": "search term", "reason": "why this search"}} +- final_answer: (only if mode="finish") Your conclusion about patch status + + +1. Do NOT call the same tool with the same input twice - CHECK KNOWLEDGE for prior calls. +2. If KNOWLEDGE shows a search was done, your next action must be DIFFERENT. +3. Output valid JSON only. thought < 100 words. 
+ + + +If a search returned results: +- Narrow down by searching within that specific file (e.g., "pattern,filename.c") +- Search for related symbols or defensive patterns in the found code +If a pattern wasn't found: +- Try simpler substrings or partial patterns +- Try a different tool (Source Grep <-> Code Keyword Search) +- Search for SEARCH_KEYWORDS from VULNERABILITY_INTEL + + + +{{"thought": "No prior searches in KNOWLEDGE. Search for key function from CVE description", "mode": "act", "actions": {{"tool": "Source Grep", "query": "", "reason": "Locate code related to CVE vulnerability"}}, "final_answer": null}} + + +{{"thought": "KNOWLEDGE shows defensive code found, no vulnerability indicators", "mode": "finish", "actions": null, "final_answer": "The package is LIKELY PATCHED. Found defensive code at [file:line]: [quote code]. The fix behavior described in the CVE appears to be present."}} + + +{{"thought": "KNOWLEDGE shows insufficient evidence to determine patch status", "mode": "finish", "actions": null, "final_answer": "INCONCLUSIVE. Could not find definitive evidence of fix or vulnerability. Manual review recommended."}} +""" + +# --------------------------------------------------------------------------- +# L1 Observation Prompts (Comprehension + Memory Update) +# --------------------------------------------------------------------------- + +L1_COMPREHENSION_PROMPT = """Analyze the tool output and extract key findings for CVE patch verification. +GOAL: Verify whether {vuln_id} fix is applied to {target_package} + +**CRITICAL FIRST CHECK - DO THIS BEFORE ANYTHING ELSE:** +Examine NEW OUTPUT below. If it is EMPTY, contains only whitespace, or shows an error: +- findings MUST be: ["FAILED: {tool_used} '{tool_input}' returned empty/no matches"] +- tool_outcome MUST be: "{tool_used} [{tool_input}] -> NO MATCHES" +- DO NOT fabricate, infer, or assume any results. STOP HERE. 
+ + +{vulnerability_intel} + + + +{raw_patch_diff} + + + +If RAW_PATCH_DIFF contains content: +- Lines starting with "-" are REMOVED by the fix (vulnerable code) +- Lines starting with "+" are ADDED by the fix (patched code) +- CRITICAL: A pattern appearing in BOTH "-" and "+" lines is NOT a unique indicator +- Only patterns EXCLUSIVELY in "-" lines indicate vulnerable code +- Only patterns EXCLUSIVELY in "+" lines indicate the fix is present +- Compare grep results against these exact lines, not the summarized patterns above + +If RAW_PATCH_DIFF is empty: +- Fall back to using VULNERABLE_PATTERNS and FIX_PATTERNS from VULNERABILITY_INTEL + + +TOOL USED: {tool_used} +TOOL INPUT: {tool_input} +THOUGHT: {last_thought} +NEW OUTPUT: +{tool_output} + +**ANTI-HALLUCINATION RULES:** +1. You can ONLY report findings based on text that ACTUALLY APPEARS in NEW OUTPUT above. +2. Every finding claiming code was "found" MUST include a direct quote from NEW OUTPUT. +3. If NEW OUTPUT is empty, you CANNOT claim any code was found - report FAILED. +4. The tool_outcome MUST accurately reflect what NEW OUTPUT shows, not what you expect. + +CODE ANALYSIS RULES (only if NEW OUTPUT has content): +1. READ the actual code snippets in NEW OUTPUT. Compare against VULNERABLE_PATTERNS and FIX_PATTERNS. +2. For each match found: + - Quote the actual line from NEW OUTPUT + - State the file:line where it was found + - Determine if it matches VULNERABLE or FIX pattern +3. If RAW_PATCH_DIFF is available: + - Check if the grep match appears in "-" lines (removed/vulnerable) + - Check if the grep match appears in "+" lines (added/fix) + - Report accordingly: matches "-" lines = VULNERABLE_CODE_FOUND + - A match that appears in BOTH "-" and "+" lines is NOT a unique indicator - note this +4. RECORD file paths and line numbers for all relevant matches. +5. FIX LOCATION: Tag FIX_DEFINITION_FOUND if found outside AFFECTED_FILES or is a function definition. 
Tag FIX_APPLIED_AT_CALL_SITE only when fix is CALLED and result USED in an AFFECTED_FILE. + + +Based on VULNERABILITY_INTEL above, assess investigation completeness: +- Have you searched in ALL files listed in AFFECTED_FILES? +- Have you found ALL instances of VULNERABLE_FUNCTIONS? +- Are there OTHER files containing the same vulnerable pattern? +If coverage is incomplete, note which files/functions remain unchecked. + + +OUTPUT RULES: +- findings: 2-4 observations. Each positive finding MUST quote actual content from NEW OUTPUT. +- tool_outcome: "{tool_used} [pattern] -> found in file.c:123" OR "{tool_used} [pattern] -> NO MATCHES" +RESPONSE: +{{""" + +L1_MEMORY_UPDATE_PROMPT = """Merge new findings into the CVE patch investigation memory. +GOAL: Verify whether {vuln_id} fix is applied to {target_package} +PREVIOUS MEMORY: {previous_memory} +NEW FINDINGS (from tool analysis): +{findings} +TOOL CALL RECORD: {tool_outcome} + +**CRITICAL: HANDLE FAILURES CORRECTLY** +If NEW FINDINGS contains "FAILED:" or TOOL CALL RECORD shows "NO MATCHES": +- Add the failure/no-match to memory verbatim +- Do NOT convert a failed search into a positive finding +- "NO MATCHES" for a fix pattern means FIX_CODE_ABSENT, not FIX_CODE_FOUND + +MEMORY RULES: +1. Start from PREVIOUS MEMORY. Append new facts from NEW FINDINGS. No duplicates. +2. Add TOOL CALL RECORD verbatim so future steps know what was already searched. +3. If NEW FINDINGS report a failure or no matches, record it as-is. Do NOT infer positive findings. 
+ +PATCH VERIFICATION TRACKING: +- Vulnerable code FOUND: add "VULNERABLE_CODE_FOUND: [pattern] in [file:line]" +- Fix at definition only: add "FIX_DEFINITION_FOUND: [pattern] in [file:line]" +- Fix called+used in AFFECTED_FILE: add "FIX_APPLIED_AT_CALL_SITE: [pattern] in [file:line]" +- No vulnerable code matches: add "VULNERABLE_CODE_ABSENT: [pattern] not found" +- No fix code matches: add "FIX_CODE_ABSENT: [pattern] not found" + +VERDICT EVIDENCE: +- PATCHED: FIX_APPLIED_AT_CALL_SITE in AFFECTED_FILES or vulnerable code absent +- VULNERABLE: vulnerable code found, fix absent from call sites +- UNCERTAIN: FIX_DEFINITION_FOUND only, no call site evidence +- INCONCLUSIVE: neither pattern found, or conflicting evidence + +- results: copy the NEW FINDINGS as-is. +- memory: updated cumulative findings with search results and evidence tags. +RESPONSE: +{{""" + +# --------------------------------------------------------------------------- +# L1 Observation Prompts (CVE-Description Mode - No Patch Available) +# --------------------------------------------------------------------------- + +L1_COMPREHENSION_PROMPT_CVE_DESC = """Analyze the tool output for CVE patch verification using CVE description context. +GOAL: Verify whether {vuln_id} fix is applied to {target_package} + +**CRITICAL FIRST CHECK - DO THIS BEFORE ANYTHING ELSE:** +Examine NEW OUTPUT below. If it is EMPTY, contains only whitespace, or shows an error: +- findings MUST be: ["FAILED: {tool_used} '{tool_input}' returned empty/no matches"] +- tool_outcome MUST be: "{tool_used} [{tool_input}] -> NO MATCHES" +- DO NOT fabricate, infer, or assume any results. STOP HERE. + +CVE DESCRIPTION: +{cve_description} + +SPEC CHANGELOG (rebase info): +{spec_log_change} + +NOTE: No patch file available. Extract search terms from CVE description. + +TOOL USED: {tool_used} +TOOL INPUT: {tool_input} +THOUGHT: {last_thought} +NEW OUTPUT: +{tool_output} + +**ANTI-HALLUCINATION RULES:** +1. 
You can ONLY report findings based on text that ACTUALLY APPEARS in NEW OUTPUT above. +2. Every finding claiming code was "found" MUST include a direct quote from NEW OUTPUT. +3. If NEW OUTPUT is empty, you CANNOT claim any code was found - report FAILED. +4. The tool_outcome MUST accurately reflect what NEW OUTPUT shows, not what you expect. + +CODE ANALYSIS RULES (only if NEW OUTPUT has content): +1. EXTRACT key identifiers from the CVE description: + - Function names, variable names, API calls + - File paths or component names mentioned + +2. For each code match in NEW OUTPUT: + - Quote the actual line from NEW OUTPUT + - Does it relate to the vulnerability described? + - Does it show defensive patterns (bounds checking, null validation)? + - Record file path and line number as evidence + +3. DEFENSIVE PATTERNS indicating a fix: + - Input validation, bounds checking, null guards + - Resource cleanup, error handling + +OUTPUT: +- findings: 2-4 observations. Each positive finding MUST quote actual content from NEW OUTPUT. +- tool_outcome: "{tool_used} [pattern] -> found in file.c:123" OR "{tool_used} [pattern] -> NO MATCHES" +RESPONSE: +{{""" + +L1_MEMORY_UPDATE_PROMPT_CVE_DESC = """Merge findings into CVE patch investigation memory. +GOAL: Verify whether {vuln_id} fix is applied to {target_package} +MODE: CVE-description based (no patch patterns) + +PREVIOUS MEMORY: {previous_memory} +NEW FINDINGS: {findings} +TOOL CALL RECORD: {tool_outcome} + +MEMORY RULES: +1. Append new facts from NEW FINDINGS to PREVIOUS MEMORY. No duplicates. +2. Add TOOL CALL RECORD verbatim. 
+ +CVE-BASED TRACKING: +- CVE-related code FOUND: "CVE_CODE_FOUND: [symbol] in [file:line]" +- Defensive pattern FOUND: "DEFENSIVE_CODE_FOUND: [pattern] in [file:line]" +- Search no match: "SEARCH_NO_MATCH: [pattern]" + +VERDICT (CVE-description mode): +- LIKELY_PATCHED: defensive code found, no vulnerability indicators +- LIKELY_VULNERABLE: vulnerability patterns found, no defensive code +- INCONCLUSIVE: insufficient evidence + +- results: copy the NEW FINDINGS as-is. +- memory: updated cumulative findings with evidence tags. +RESPONSE: +{{""" + + +# =========================================================================== +# L2 BUILD AGENT PROMPTS +# =========================================================================== + +# --------------------------------------------------------------------------- +# L2 Configuration Investigation Prompts +# --------------------------------------------------------------------------- + +L2_CONFIG_SYS_PROMPT = ( + "You are an L2 Build Agent investigating whether VULNERABLE CODE is DISABLED at build time.\n\n" + "GOAL: Determine if the CVE-affected feature/component is compiled into the binary.\n\n" + "EVIDENCE SOURCES:\n" + "1. BUILD_HARVEST section below - disabled/enabled features ALREADY extracted (analyze in thought)\n" + "2. Build log (searchable with 'logs:' prefix) - verify affected source files were compiled\n\n" + "INVESTIGATION FLOW:\n" + "1. Analyze BUILD_HARVEST in your thought (no tool needed - just read and decide)\n" + "2. If feature IS DISABLED -> mode='finish' with NOT_COMPILED verdict immediately\n" + "3. If feature IS ENABLED -> mode='act' with 'logs:' prefix to verify affected files in build log\n" + "4. 
If feature NOT listed -> mode='act' with 'logs:' prefix to search build log for compilation evidence\n\n" + "VERDICTS:\n" + "- NOT_COMPILED: Feature is disabled OR affected files not in build log\n" + "- COMPILED: Feature is enabled AND affected files are compiled\n" + "- UNKNOWN: Cannot determine from available evidence" +) + +L2_CONFIG_PROMPT_TEMPLATE = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} + + + +{vulnerability_intel} + +L1 Preliminary Verdict: {l1_preliminary_verdict} + + + +** CHECK THESE FIRST - No tool call needed! ** + +Disabled Features (from build log -D defines): +{disabled_features} + +Disabled Features (from spec configure flags): +{spec_disabled_features} + +Enabled Features (from build log -D defines): +{enabled_features} + +Enabled Features (from spec configure flags): +{spec_enabled_features} + +Linked Libraries (from build -l flags): +{linked_libraries} + +Built Subpackages (from spec %package): +{built_subpackages} + +Excluded Subpackages (from spec %bcond_without, ExcludeArch): +{excluded_subpackages} + +Kernel Config (CONFIG_*=y/n/m, kernel packages only): +{kernel_config} + +DECISION GUIDE: +- If CVE-affected feature in DISABLED lists -> verdict NOT_COMPILED (no tool needed) +- If CVE-affected feature in ENABLED lists -> strong signal for COMPILED (verify affected files in build log) +- If CVE-affected module in EXCLUDED subpackages -> verdict NOT_COMPILED +- If CVE-affected library NOT in linked_libraries -> check for alternative backend +- For kernel CVEs: if CONFIG_X=n in kernel_config -> NOT_COMPILED +- If lists are empty or feature not listed -> search build log with 'logs:' prefix + +IMPORTANT - Library vs Capability: +- Disabled features often name a specific LIBRARY, not the capability itself +- Example: 'ssl' disabled = OpenSSL library disabled, NOT TLS capability +- Always check if an ALTERNATIVE library for the same capability is ENABLED +- If alternative enabled, the capability IS compiled (just 
different implementation) +- Common patterns: ssl->nss/gnutls (TLS), zlib->zstd (compression), libidn->libidn2 (IDN) +- Check linked_libraries for actual library linkage evidence + + + +{tools} + + +{tool_instructions} + +RESPONSE: +{{""" + +L2_CONFIG_THOUGHT_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +1. You MUST select a tool ONLY from . Do NOT invent tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions with a tool. mode="finish" REQUIRES final_answer. NEVER use mode="act" with null actions. +4. Analyze BUILD_HARVEST in your thought, then immediately decide: finish (if disabled) or act with tool (to verify). +5. If feature in DISABLED lists -> check if alternative library for same capability is ENABLED first. +6. If DISABLED library has alternative ENABLED (e.g., ssl disabled but nss enabled) -> capability IS compiled. +7. If feature in ENABLED lists -> strong signal for COMPILED, verify affected files in build log. +8. If feature NOT in BUILD_HARVEST, search build log using 'logs:' prefix (e.g., 'logs:filename.c'). +9. NEVER grep source code - use 'logs:' prefix to search build logs for compilation evidence. +10. Do NOT call the same tool with the same input twice. + + + +{{"thought": "CVE affects SM2 crypto. BUILD_HARVEST shows 'sm2' in spec_disabled_features. SM2 is disabled at build time.", "mode": "finish", "actions": null, "final_answer": "NOT_COMPILED. The spec file configures with 'no-sm2' flag, which disables SM2 cryptographic functions. 
The vulnerable code in crypto/sm2/ is not compiled into the binary."}} + + + +{{"thought": "CVE affects zisofs. BUILD_HARVEST disabled features are empty - zisofs not disabled. Need to verify affected file was compiled.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "logs:archive_read_support_format_iso9660", "reason": "Check if affected file appears in build compilation log"}}, "final_answer": null}} + + + +{{"thought": "CVE affects LDAP. BUILD_HARVEST shows 'ldap' in spec_enabled_features. LDAP explicitly enabled - strong signal feature is compiled. Verifying affected files in build log.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "logs:ldap.c", "reason": "Verify LDAP-related files appear in build compilation log"}}, "final_answer": null}} + + + +{{"thought": "CVE affects TLS. 'ssl' in DISABLED but 'nss' in ENABLED. ssl=OpenSSL library, nss=NSS library - both provide TLS. Alternative library enabled, so TLS capability IS compiled. Verifying TLS files.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "logs:vtls", "reason": "Verify TLS abstraction layer files compiled in build log"}}, "final_answer": null}} + + + +{{"thought": "Feature not disabled. Now verify affected file crypto/sm2/sm2.c was compiled.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "logs:sm2.c", "reason": "Check if affected source file appears in build log"}}, "final_answer": null}} + + + +{{"thought": "Found no-sm2 in spec_disabled_features. SM2 code is not compiled.", "mode": "finish", "actions": null, "final_answer": "NOT_COMPILED. The spec file configures with 'no-sm2' flag, which disables SM2 cryptographic functions. The vulnerable code in crypto/sm2/ is not compiled into the binary."}} + + + +{{"thought": "SM2 not disabled. Found sm2.c compilation in build log.", "mode": "finish", "actions": null, "final_answer": "COMPILED. SM2 is not in disabled features. 
Build log shows 'gcc -c crypto/sm2/sm2_crypt.c -o sm2_crypt.o', confirming the vulnerable code is compiled into the binary."}} + + + +{{"thought": "Cannot find evidence either way. Affected files not in build log but feature not disabled.", "mode": "finish", "actions": null, "final_answer": "UNKNOWN. The feature is not explicitly disabled, but the affected files do not appear in the build log. Cannot determine compilation status."}} +""" + +# --------------------------------------------------------------------------- +# L2 Configuration Investigation Prompts - SPEC-ONLY MODE (no build log) +# --------------------------------------------------------------------------- + +L2_CONFIG_SPEC_ONLY_SYS_PROMPT = ( + "You are an L2 Build Agent investigating whether VULNERABLE CODE is DISABLED at build time.\n\n" + "GOAL: Determine if the CVE-affected feature/component is compiled into the binary.\n\n" + "** SPEC-ONLY MODE - No build log available **\n" + "Confidence is LOWER because we cannot verify actual compilation output.\n" + "Conclusions are based on BUILD INTENT from spec configure flags, not actual build output.\n\n" + "EVIDENCE SOURCES:\n" + "1. BUILD_HARVEST section below - disabled/enabled features from spec %build section\n" + "2. Source/build system files (CMakeLists.txt, Makefile.am, configure.ac) - searchable with Source Grep\n\n" + "INVESTIGATION FLOW:\n" + "1. Analyze BUILD_HARVEST in your thought (no tool needed - just read and decide)\n" + "2. If feature IS DISABLED -> mode='finish' with NOT_COMPILED verdict immediately\n" + "3. If feature IS ENABLED -> mode='finish' with COMPILED verdict (lower confidence)\n" + "4. 
If feature NOT listed -> mode='act' to search build system files (CMakeLists.txt, Makefile.am)\n\n" + "VERDICTS:\n" + "- NOT_COMPILED: Feature explicitly disabled in spec configure flags\n" + "- COMPILED: Feature not disabled (lower confidence - based on spec, not build output)\n" + "- UNKNOWN: Cannot determine from available evidence" +) + +L2_CONFIG_PROMPT_SPEC_ONLY_TEMPLATE = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} + + + +{vulnerability_intel} + +L1 Preliminary Verdict: {l1_preliminary_verdict} + + + +** SPEC-ONLY MODE - No build log available ** +Confidence: Lower - conclusions based on spec configure flags only + +Disabled Features (from spec configure flags): +{spec_disabled_features} + +Enabled Features (from spec configure flags): +{spec_enabled_features} + +Built Subpackages (from spec %package): +{built_subpackages} + +Excluded Subpackages (from spec %bcond_without, ExcludeArch): +{excluded_subpackages} + +Kernel Config (CONFIG_*=y/n/m, kernel packages only): +{kernel_config} + +DECISION GUIDE (spec-only): +- If CVE-affected feature in DISABLED list -> verdict NOT_COMPILED +- If CVE-affected feature in ENABLED list -> strong signal for COMPILED +- If CVE-affected module in EXCLUDED subpackages -> verdict NOT_COMPILED +- For kernel CVEs: if CONFIG_X=n in kernel_config -> NOT_COMPILED +- If feature not listed -> search build system files (CMakeLists.txt, Makefile.am, configure.ac) +- Cannot verify actual compilation - verdicts are based on BUILD INTENT, not build output + +IMPORTANT - Library vs Capability: +- Disabled features often name a specific LIBRARY, not the capability itself +- Example: 'ssl' disabled = OpenSSL library disabled, NOT TLS capability +- Always check if an ALTERNATIVE library for the same capability is ENABLED +- If alternative enabled, the capability IS compiled (just different implementation) +- Common patterns: ssl->nss/gnutls (TLS), zlib->zstd (compression), libidn->libidn2 (IDN) + + + +{tools} + + 
+{tool_instructions} + +RESPONSE: +{{""" + +L2_CONFIG_SPEC_ONLY_THOUGHT_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +1. You MUST select a tool ONLY from . Do NOT invent tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions with a tool. mode="finish" REQUIRES final_answer. NEVER use mode="act" with null actions. +4. Analyze BUILD_HARVEST in your thought, then immediately decide: finish (if disabled) or act with tool (to verify). +5. If feature in DISABLED lists -> check if alternative library for same capability is ENABLED first. +6. If DISABLED library has alternative ENABLED (e.g., ssl disabled but nss enabled) -> capability IS compiled. +7. If feature in ENABLED lists -> strong signal for COMPILED. Can finish or search build system files for confirmation. +8. If feature NOT in BUILD_HARVEST, search build system files (CMakeLists.txt, Makefile.am, configure.ac). +9. Use Source Grep to search source code and build system files. Do NOT use 'logs:' prefix (no build log available). +10. Do NOT call the same tool with the same input twice. + + + +{{"thought": "CVE affects SM2 crypto. BUILD_HARVEST shows 'sm2' in spec_disabled_features. SM2 is disabled at build time.", "mode": "finish", "actions": null, "final_answer": "NOT_COMPILED. The spec file configures with 'no-sm2' flag, which disables SM2 cryptographic functions. The vulnerable code in crypto/sm2/ is not compiled into the binary."}} + + + +{{"thought": "CVE affects zisofs. BUILD_HARVEST disabled features are empty - zisofs not disabled. 
Searching build system for zisofs configuration.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "zisofs", "reason": "Search CMakeLists.txt or Makefile.am for zisofs feature configuration"}}, "final_answer": null}} + + + +{{"thought": "CVE affects LDAP. BUILD_HARVEST shows 'ldap' in spec_enabled_features. LDAP explicitly enabled in spec - feature is compiled (lower confidence, spec-only mode).", "mode": "finish", "actions": null, "final_answer": "COMPILED. The spec file explicitly enables LDAP with '--with-ldap' configure flag. Based on spec configure flags (no build log available), the vulnerable code is likely compiled into the binary."}} + + + +{{"thought": "CVE affects TLS. 'ssl' in DISABLED but 'nss' in ENABLED. ssl=OpenSSL library, nss=NSS library - both provide TLS. Alternative library enabled, so TLS capability IS compiled.", "mode": "finish", "actions": null, "final_answer": "COMPILED. While OpenSSL (ssl) is disabled, NSS is enabled as an alternative TLS library. The TLS capability is compiled with a different implementation."}} + + + +{{"thought": "Feature not in spec flags. Searching CMakeLists.txt for feature configuration.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "option.*SM2|SM2.*OFF|SM2.*ON", "reason": "Search CMakeLists.txt for SM2 feature toggle"}}, "final_answer": null}} + + + +{{"thought": "Found no-sm2 in spec_disabled_features. SM2 code is not compiled.", "mode": "finish", "actions": null, "final_answer": "NOT_COMPILED. The spec file configures with 'no-sm2' flag, which disables SM2 cryptographic functions. The vulnerable code in crypto/sm2/ is not compiled into the binary."}} + + + +{{"thought": "SM2 not disabled in spec. No evidence of disabling in build system files either.", "mode": "finish", "actions": null, "final_answer": "COMPILED. SM2 is not in disabled features in the spec file. No build system configuration disables it. 
Based on spec configure flags (no build log available), the vulnerable code is likely compiled into the binary."}} + + + +{{"thought": "Cannot find evidence either way. Feature not mentioned in spec flags or build system files.", "mode": "finish", "actions": null, "final_answer": "UNKNOWN. The feature is not explicitly configured in the spec file or build system files. Cannot determine compilation status without build log evidence."}} +""" + +# --------------------------------------------------------------------------- +# L2 Hardening Investigation Prompts +# --------------------------------------------------------------------------- + +L2_HARDENING_SYS_PROMPT = ( + "You are an L2 Build Agent investigating COMPILER HARDENING mitigations.\n\n" + "GOAL: Determine if hardening flags relevant to this CVE's vulnerability class are present.\n\n" + "CONTEXT: Investigation 1 determined the vulnerable code IS compiled. Now check if\n" + "compiler/linker hardening makes exploitation significantly harder.\n\n" + "CRITICAL - CWE-SPECIFIC MATCHING:\n" + "- ONLY flags listed in EXPECTED_HARDENING can justify a MITIGATED verdict\n" + "- General hardening flags (stack protector, FORTIFY_SOURCE) do NOT mitigate all CWEs\n" + "- Example: -fstack-protector helps CWE-121 (stack overflow), NOT CWE-190 (integer overflow)\n" + "- You MUST match the EXACT flags from EXPECTED_HARDENING table to the build output\n\n" + "EVIDENCE SOURCES:\n" + "1. EXPECTED_HARDENING table (CWE-specific flags from knowledge base) - THIS IS YOUR CHECKLIST\n" + "2. 
Build log (searchable with 'logs:' prefix) - contains CFLAGS/CXXFLAGS/LDFLAGS definitions\n\n" + "EFFICIENT SEARCH STRATEGY:\n" + "- Search 'logs:FLAGS=' to get ALL compiler/linker flags in ONE call (matches CFLAGS=, LDFLAGS=, etc.)\n" + "- Grep supports regex OR: 'logs:CFLAGS\\|LDFLAGS' combines patterns\n" + "- Analyze the output to check for expected hardening flags - avoid multiple individual searches\n\n" + "IMPORTANT - RHEL/Fedora Specs Files:\n" + "When you see these specs files in build logs, hardening flags are IMPLICITLY enabled:\n" + "- '-specs=/usr/lib/rpm/redhat/redhat-hardened-cc1' => -fPIE (position-independent code for ASLR)\n" + "- '-specs=/usr/lib/rpm/redhat/redhat-hardened-ld' => -pie + -z now (PIE linking + BIND_NOW/Full RELRO)\n" + "These flags will NOT appear explicitly in the build log - the specs file injects them.\n" + "If you see these specs files, count the corresponding protections as PRESENT.\n\n" + "INVESTIGATION STEPS:\n" + "1. Review EXPECTED_HARDENING table - these are the ONLY flags that matter for this CWE\n" + "2. Search 'logs:FLAGS=' to get all compiler/linker flag definitions at once\n" + "3. For EACH flag in EXPECTED_HARDENING, check if present in build output\n" + "4. 
Verdict based ONLY on EXPECTED_HARDENING flags (ignore unrelated hardening)\n\n" + "VERDICTS:\n" + "- MITIGATED: One or more flags from EXPECTED_HARDENING are present in build\n" + "- NOT_MITIGATED: NONE of the EXPECTED_HARDENING flags are present (even if other hardening exists)\n" + "- UNKNOWN: Cannot determine from available evidence" +) + +L2_HARDENING_PROMPT_TEMPLATE = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} +CWE: {cwe_id} + + + +The following compiler/linker flags mitigate this vulnerability class: + +{expected_hardening_table} + + + +{tools} + + +{tool_instructions} + +RESPONSE: +{{""" + +L2_HARDENING_THOUGHT_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +1. You MUST select a tool ONLY from . Do NOT invent tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions. mode="finish" REQUIRES final_answer. +4. EFFICIENT SEARCH: First search for 'logs:FLAGS=' to get ALL compiler/linker flags in one call. +5. Grep supports regex OR: use 'logs:CFLAGS\\|LDFLAGS' to combine patterns. +6. Analyze the FLAGS output to check for expected hardening flags - avoid multiple individual searches. +7. Do NOT call the same tool with the same input twice. +8. STRICT MATCHING: Only flags from EXPECTED_HARDENING justify MITIGATED verdict. +9. IGNORE UNRELATED HARDENING: -fstack-protector, -D_FORTIFY_SOURCE do NOT mitigate all CWEs. Check the table! + + + +{{"thought": "Need to check hardening flags for this CWE. 
Search for all compiler/linker flag definitions at once.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "logs:FLAGS=", "reason": "Get all CFLAGS/CXXFLAGS/LDFLAGS definitions in one search"}}, "final_answer": null}} + + + +{{"thought": "CWE-121 stack overflow. EXPECTED_HARDENING lists -fstack-protector-strong. Found -fstack-protector-strong in CFLAGS. This matches EXPECTED_HARDENING.", "mode": "finish", "actions": null, "final_answer": "MITIGATED. Found -fstack-protector-strong which is listed in EXPECTED_HARDENING for CWE-121. Stack buffer overflows will be detected at runtime."}} + + + +{{"thought": "CWE-190 integer overflow. EXPECTED_HARDENING lists -ftrapv, -fsanitize=signed-integer-overflow. Found -fstack-protector-strong and -D_FORTIFY_SOURCE=2, but these are for memory safety NOT integer overflow. Checking for expected flags: -ftrapv MISSING, -fsanitize=signed-integer-overflow MISSING.", "mode": "finish", "actions": null, "final_answer": "NOT_MITIGATED. Build has general hardening (-fstack-protector-strong, -D_FORTIFY_SOURCE=2) but NONE of the CWE-190 specific flags from EXPECTED_HARDENING (-ftrapv, -fsanitize=signed-integer-overflow) are present. Integer overflow is not mitigated."}} + + + +{{"thought": "CWE-693 control flow. EXPECTED_HARDENING lists PIE. Found -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 which implies -fPIE. This matches EXPECTED_HARDENING.", "mode": "finish", "actions": null, "final_answer": "MITIGATED. Build uses RHEL hardened specs files which implicitly enable PIE (listed in EXPECTED_HARDENING for CWE-693). Control flow exploitation is significantly harder."}} + + + +{{"thought": "CWE-190 integer overflow. EXPECTED_HARDENING lists -ftrapv, -fsanitize=*-integer-overflow. None found in FLAGS output.", "mode": "finish", "actions": null, "final_answer": "NOT_MITIGATED. 
None of the EXPECTED_HARDENING flags for CWE-190 (-ftrapv, -fsanitize=signed-integer-overflow, -fsanitize=unsigned-integer-overflow) are present in the build."}} + + + +{{"thought": "Build log does not contain FLAGS= definitions. Cannot determine if EXPECTED_HARDENING flags are present.", "mode": "finish", "actions": null, "final_answer": "UNKNOWN. Build log does not contain CFLAGS/LDFLAGS definitions. Cannot determine if EXPECTED_HARDENING mitigations are present."}} +""" + +# --------------------------------------------------------------------------- +# L2 Kernel-Specific Configuration Prompts +# --------------------------------------------------------------------------- + +L2_KERNEL_CONFIG_SYS_PROMPT = ( + "You are an L2 Build Agent investigating whether VULNERABLE CODE is COMPILED in a KERNEL package.\n\n" + "KERNEL BUILD CHARACTERISTICS:\n" + "- Kernel builds use 'make -s' (silent) - individual .c file compilation is NOT visible in build logs\n" + "- Compilation is controlled by CONFIG_* options in kernel config files (kernel-{arch}-rhel.config)\n" + "- CONFIG_X=y means built-in (COMPILED), CONFIG_X=m means module (COMPILED), CONFIG_X=n means NOT COMPILED\n" + "- Source-to-module mapping requires parsing Makefile to find obj-$(CONFIG_X) directives\n\n" + "GOAL: Determine if the CVE-affected kernel code is compiled by checking CONFIG_* settings.\n\n" + "INVESTIGATION STRATEGY:\n" + "1. Identify the affected source file(s) from VULNERABILITY_INTEL (e.g., net/netfilter/nf_tables_api.c)\n" + "2. Read the Makefile in the same directory to find which CONFIG_* controls compilation\n" + " - Look for patterns like: nf_tables-objs := ... nf_tables_api.o\n" + " - Then find: obj-$(CONFIG_NF_TABLES) += nf_tables.o\n" + "3. Grep the kernel config file for that CONFIG symbol to get its value (y/m/n)\n" + "4. 
Determine compilation status based on CONFIG value\n\n" + "VERDICTS:\n" + "- NOT_COMPILED: CONFIG_X=n or CONFIG symbol not defined (feature disabled)\n" + "- COMPILED: CONFIG_X=y (built-in) or CONFIG_X=m (loadable module)\n" + "- UNKNOWN: Cannot determine CONFIG symbol or config file unavailable" +) + +L2_KERNEL_CONFIG_PROMPT_TEMPLATE = """{sys_prompt} + + +CVE ID: {vuln_id} +Target Package: {target_package} + + + +{vulnerability_intel} + +L1 Preliminary Verdict: {l1_preliminary_verdict} + + + +Target Architecture Config File: {kernel_config_path} +Kernel Source Root: {kernel_source_root} + +IMPORTANT: Different architectures (x86_64, aarch64, s390x, ppc64le) have DIFFERENT config files. +A CONFIG option may be enabled on x86_64 but disabled on s390x. +Always search the TARGET ARCHITECTURE config file specified above. + +INVESTIGATION STEPS: +1. From VULNERABILITY_INTEL, identify affected source file(s) + Example: net/netfilter/nf_tables_api.c + +2. Grep the Makefile in the same directory as the affected .c file to find the CONFIG symbol + Source Grep format is pattern[,file_glob] — it searches FILE CONTENTS, not file paths. + WRONG: "drivers/nvme/host/Makefile" (searches for that path string inside files → no matches) + RIGHT: "nvme-tcp,drivers/nvme/host/Makefile" (search for "nvme-tcp" only in that Makefile) + Look for: nf_tables-objs := ... nf_tables_api.o OR nvme-tcp-y += tcp.o + obj-$(CONFIG_NF_TABLES) += nf_tables.o OR obj-$(CONFIG_NVME_TCP) += nvme-tcp.o + → This tells you which CONFIG_* controls the affected .c file + +3. Grep the TARGET ARCHITECTURE kernel config file for that CONFIG symbol + Use the basename from "Target Architecture Config File" above (e.g. kernel-x86_64-rhel.config). + Example: "CONFIG_NF_TABLES=,kernel-x86_64-rhel.config" + Result: CONFIG_NF_TABLES=m + +4. 
Interpret the result: + - CONFIG_X=y → Built into kernel (COMPILED) + - CONFIG_X=m → Built as module (COMPILED) + - CONFIG_X=n or "# CONFIG_X is not set" → NOT_COMPILED + - Symbol not found → Likely NOT_COMPILED (feature not configured) + + + +{tools} + + +{tool_instructions} + +RESPONSE: +{{""" + +L2_KERNEL_THOUGHT_INSTRUCTIONS = """ +You will receive KNOWLEDGE (cumulative findings) and LATEST FINDINGS (most recent tool results). +BEFORE ACTING, you MUST: +1. Review KNOWLEDGE to see what tools were already called (TOOL_CALL_RECORD entries) +2. Review LATEST FINDINGS for the most recent tool output analysis +3. NEVER repeat any action already in TOOL_CALL_RECORD +4. Your next action MUST build on findings - progress the investigation + + + +Source Grep query: [target:]pattern[,file_glob] +- pattern = text to find INSIDE files (function name, CONFIG_*, module name like nvme-tcp) +- file_glob = optional path/filename filter after a comma (Makefile, *.c, kernel-x86_64-rhel.config) +- NEVER pass a bare file path as the pattern (e.g. "drivers/nvme/host/Makefile" will NOT open that file) +- Use specific patterns, not short strings like "tcp" (matches unrelated files like kernel.spec / mptcp) + + + +1. You MUST select a tool ONLY from . Do NOT invent tool names. +2. Output valid JSON only. thought < 100 words. final_answer < 150 words. +3. mode="act" REQUIRES actions with a tool. mode="finish" REQUIRES final_answer. +4. For kernel packages: FIRST grep the directory Makefile for CONFIG/obj-* lines, THEN grep the arch config file. +5. Makefile step: use pattern[,relative/path/Makefile] — e.g. "nf_tables,net/netfilter/Makefile" or "nvme-tcp,drivers/nvme/host/Makefile". +6. Config step: use CONFIG_SYMBOL=,config-basename — e.g. "CONFIG_NVME_TCP=,kernel-x86_64-rhel.config" (basename only, not full path). +7. Do NOT search build logs for .c compilation - kernel uses make -s (silent). +8. Do NOT call the same tool with the same input twice. 
+ + + +{{"thought": "CVE affects net/netfilter/nf_tables_api.c. Grep the directory Makefile for nf_tables / CONFIG lines.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "nf_tables,net/netfilter/Makefile", "reason": "Find obj-$(CONFIG_*) controlling nf_tables in that Makefile only"}}, "final_answer": null}} + + + +{{"thought": "CVE affects drivers/nvme/host/tcp.c. Grep the host Makefile for nvme-tcp and CONFIG_NVME_TCP.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "nvme-tcp,drivers/nvme/host/Makefile", "reason": "Find obj-$(CONFIG_NVME_TCP) and nvme-tcp-y lines linking tcp.c to a CONFIG symbol"}}, "final_answer": null}} + + + +{{"thought": "Makefile shows obj-$(CONFIG_NF_TABLES) += nf_tables.o. Grep arch config for CONFIG_NF_TABLES value.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "CONFIG_NF_TABLES=,kernel-x86_64-rhel.config", "reason": "Check if CONFIG_NF_TABLES is y/m (compiled) or n (not compiled)"}}, "final_answer": null}} + + + +{{"thought": "Makefile shows obj-$(CONFIG_NVME_TCP) += nvme-tcp.o and nvme-tcp-y += tcp.o. Check CONFIG_NVME_TCP in arch config.", "mode": "act", "actions": {{"tool": "Source Grep", "query": "CONFIG_NVME_TCP=,kernel-x86_64-rhel.config", "reason": "Confirm CONFIG_NVME_TCP=y/m/n for compilation verdict"}}, "final_answer": null}} + + + +{{"thought": "CONFIG_NF_TABLES=m found in kernel config. Module is compiled.", "mode": "finish", "actions": null, "final_answer": "COMPILED. CONFIG_NF_TABLES=m in kernel config means nf_tables is built as a loadable module. The vulnerable code in net/netfilter/nf_tables_api.c IS compiled into the kernel."}} + + + +{{"thought": "CONFIG_NF_TABLES=y found in kernel config. Built-in to kernel.", "mode": "finish", "actions": null, "final_answer": "COMPILED. CONFIG_NF_TABLES=y in kernel config means nf_tables is built into the kernel. 
The vulnerable code in net/netfilter/nf_tables_api.c IS compiled into the kernel."}} + + + +{{"thought": "CONFIG_NF_TABLES is not set (n) in kernel config. Feature disabled.", "mode": "finish", "actions": null, "final_answer": "NOT_COMPILED. CONFIG_NF_TABLES=n (or not set) in kernel config means nf_tables is disabled. The vulnerable code in net/netfilter/nf_tables_api.c is NOT compiled into the kernel."}} + + + +{{"thought": "Could not find CONFIG symbol in Makefile. Cannot determine compilation status.", "mode": "finish", "actions": null, "final_answer": "UNKNOWN. Could not determine which CONFIG symbol controls the affected code. Manual review of kernel configuration required."}} +""" + +# --------------------------------------------------------------------------- +# L2 Verdict Extraction Prompts +# --------------------------------------------------------------------------- + +L2_COMPILATION_VERDICT_PROMPT = """Extract the compilation verdict from the L2 Configuration investigation. + + +{final_answer} + + +Extract: +1. compilation_status: "compiled", "not_compiled", or "unknown" +2. confidence: 0.0 to 1.0 based on evidence strength +3. reasoning: Brief explanation (1-2 sentences) + +Output JSON only: +{{"compilation_status": "...", "confidence": 0.X, "reasoning": "..."}}""" + +L2_HARDENING_VERDICT_PROMPT = """Extract the hardening verdict from the L2 Hardening investigation. + + +{final_answer} + + +Extract: +1. hardening_status: "mitigated", "not_mitigated", "not_applicable", or "unknown" + - "not_applicable": This CWE class has no compiler-level mitigations available +2. hardening_flags: List of specific compiler/linker flags that provide protection (e.g., ["-fstack-protector-strong", "-D_FORTIFY_SOURCE=2", "RELRO", "PIE"]) + - Extract the actual flag names mentioned in the investigation + - Empty list if no relevant flags found +3. confidence: 0.0 to 1.0 based on evidence strength +4. 
reasoning: Brief explanation (1-2 sentences) + +Output JSON only: +{{"hardening_status": "...", "hardening_flags": ["..."], "confidence": 0.X, "reasoning": "..."}}""" + +# --------------------------------------------------------------------------- +# L2 Observation Prompts (Comprehension + Memory Update) +# --------------------------------------------------------------------------- + +L2_COMPREHENSION_PROMPT = """Analyze the tool output for L2 build/compilation verification. +GOAL: Determine whether {vuln_id} vulnerable code is COMPILED in {target_package} + + +{vulnerability_intel} + + + +Disabled Features (build log): {disabled_features} +Disabled Features (spec file): {spec_disabled_features} +Enabled Features (build log): {enabled_features} +Enabled Features (spec file): {spec_enabled_features} + + +TOOL USED: {tool_used} +TOOL INPUT: {tool_input} +THOUGHT: {last_thought} +NEW OUTPUT: +{tool_output} + +BUILD ANALYSIS RULES: +1. CHECK if tool output shows: + - Compilation commands for AFFECTED_FILES (e.g., gcc -c file.c -o file.o) + - Feature-disable flags that match the CVE-affected component + - Object files or compilation artifacts for VULNERABLE_FUNCTIONS + +2. COMPILATION EVIDENCE: + - COMPILED: Found gcc/compile commands for affected files + - NOT_COMPILED: Feature is disabled OR affected files not in build + - UNKNOWN: Insufficient evidence + +3. RECORD specific file paths, compile commands, or flag matches. + +TOOL-SPECIFIC RULES: +- If NEW OUTPUT is empty or error: "FAILED: [tool] [input] - [reason]" +- Source Grep: Check if matches show compilation or feature disabling +- Build log search: Look for compile commands and disabled features + +OUTPUT: +- findings: 2-4 key observations about compilation status +- tool_outcome: "Source Grep [pattern] -> found in build.log:123" +RESPONSE: +{{""" + +L2_MEMORY_UPDATE_PROMPT = """Merge findings into L2 build investigation memory. 
+GOAL: Determine whether {vuln_id} vulnerable code is COMPILED in {target_package} + +PREVIOUS MEMORY: {previous_memory} +NEW FINDINGS: {findings} +TOOL CALL RECORD: {tool_outcome} + +MEMORY RULES: +1. Append NEW FINDINGS to PREVIOUS MEMORY. No duplicates. +2. Add TOOL CALL RECORD verbatim. +3. If NEW FINDINGS report a failure, add the failure to memory. + +COMPILATION TRACKING: +- Affected file COMPILED: "FILE_COMPILED: [file] - evidence: [compile command]" +- Affected file NOT_COMPILED: "FILE_NOT_COMPILED: [file] - evidence: [disabled feature]" +- Feature DISABLED: "FEATURE_DISABLED: [feature] in [build_log/spec]" +- Feature ENABLED: "FEATURE_ENABLED: [feature] - no disable flag found" + +VERDICT EVIDENCE: +- NOT_COMPILED evidence: feature disabled OR affected files not compiled +- COMPILED evidence: affected files appear in compile commands +- UNKNOWN: conflicting evidence or no compilation info found + +- results: copy the NEW FINDINGS as-is. +- memory: updated cumulative findings with evidence tags. +RESPONSE: +{{""" + +# --------------------------------------------------------------------------- +# L2 Hardening Observation Prompts (Comprehension + Memory Update) +# --------------------------------------------------------------------------- + +L2_HARDENING_COMPREHENSION_PROMPT = """Analyze the tool output for L2 hardening flag verification. +GOAL: Determine whether {vuln_id} has HARDENING mitigations in {target_package} + + +CWE: {cwe_id} +Expected Hardening Flags: +{expected_hardening} + + +TOOL USED: {tool_used} +TOOL INPUT: {tool_input} +THOUGHT: {last_thought} +NEW OUTPUT: +{tool_output} + +HARDENING ANALYSIS RULES: +1. FIRST CHECK Expected Hardening Flags above: + - If "None" or empty: This CWE has NO known compiler-level mitigations + - Mark findings as "NO_RELEVANT_HARDENING: {cwe_id} has no compiler mitigations" + - Skip searching for generic flags - they won't help this vulnerability class + +2. 
IF expected hardening flags exist, CHECK tool output for: + - Compiler hardening flags (e.g., -fstack-protector, -fPIE, -fstack-clash-protection) + - Preprocessor defines (e.g., -D_FORTIFY_SOURCE=2, -D_GLIBCXX_ASSERTIONS) + - Linker hardening flags (e.g., -Wl,-z,relro, -Wl,-z,now) + +3. HARDENING EVIDENCE: + - FLAG_PRESENT: Found expected hardening flag in build commands + - FLAG_ABSENT: Searched but did not find expected flag + - NOT_APPLICABLE: No compiler mitigations exist for this CWE class + - UNKNOWN: Insufficient evidence + +4. RECORD specific flags found and their context (compilation line). + +TOOL-SPECIFIC RULES: +- If NEW OUTPUT is empty or error: "FAILED: [tool] [input] - [reason]" +- Source Grep: Check if matches show hardening flags in gcc/clang commands +- Build log search: Look for -f*, -D*, -Wl,* patterns + +OUTPUT: +- findings: 2-4 key observations about hardening flags +- tool_outcome: "Source Grep [pattern] -> found in build.log:123" +RESPONSE: +{{""" + +L2_HARDENING_MEMORY_UPDATE_PROMPT = """Merge findings into L2 hardening investigation memory. +GOAL: Determine whether {vuln_id} has HARDENING mitigations in {target_package} + +CWE: {cwe_id} +PREVIOUS MEMORY: {previous_memory} +NEW FINDINGS: {findings} +TOOL CALL RECORD: {tool_outcome} + +MEMORY RULES: +1. Append NEW FINDINGS to PREVIOUS MEMORY. No duplicates. +2. Add TOOL CALL RECORD verbatim. +3. If NEW FINDINGS report a failure, add the failure to memory. 
+ +HARDENING TRACKING: +- No relevant hardening: "NO_RELEVANT_HARDENING: [CWE] has no compiler mitigations" +- Flag FOUND: "HARDENING_PRESENT: [flag] - evidence: [build command excerpt]" +- Flag NOT FOUND: "HARDENING_ABSENT: [flag] - searched but not found" +- Critical mitigation: "CRITICAL_MITIGATION: [flag] for [CWE] - [present/absent]" + +VERDICT EVIDENCE: +- NOT_APPLICABLE evidence: this CWE class has no compiler-level mitigations +- MITIGATED evidence: key hardening flags present that reduce exploitability +- NOT_MITIGATED evidence: expected hardening flags absent +- UNKNOWN: build log incomplete or no compilation commands found + +- results: copy the NEW FINDINGS as-is. +- memory: updated cumulative findings with hardening evidence tags. +RESPONSE: +{{""" + + +# =========================================================================== +# FINAL REPORT PROMPTS +# =========================================================================== + +CODE_AGENT_REPORT_PROMPT = """\ + +You are a security analyst generating the final checker investigation report. +Synthesize the results from the target package analysis, additional intel (target rebase check + reference package), code agent analysis (source code), +and optionally build agent analysis (compilation, linking, hardening) into a comprehensive, auditable report with a clear +justification and supporting evidence. 
+ + + +CVE: {vuln_id} +Target Package: {target_package} +CVE Description: {cve_description} + + +{policy_context_section} + +## Target Package Analysis +(Checked the target package being scanned for CVE-specific patch files) +{downstream_section} + +## Additional Intel (Target Rebase + Reference Package) +This section contains TWO distinct checks: +- TARGET REBASE CHECK: Searched the target package's spec file for CVE mentions indicating a rebase fix +- REFERENCE PACKAGE: Downloaded a known-fixed package version from intel to extract patch patterns +{upstream_section} + +## Code Agent Analysis +{l1_agent_section} + + +{build_agent_status_section} +{l2_context_section} +{override_notice_section} + +Generate a structured report following these requirements: + +1. JUSTIFICATION LABEL: + + FIRST: Check BUILD_AGENT_STATUS above. + - If status is "ran_with_override": You MUST use the DECISION TREE below to select the label. + The code agent may report "vulnerable" in source, but the build agent determines final exploitability. + IGNORE the code agent's "vulnerable" verdict when an override is present. + - If status is "ran_no_override": Use code agent verdict with build context. + - If status is "not_run": Use CODE AGENT PRECEDENCE RULES at the end of this section. + + LABEL SELECTION DECISION TREE (MANDATORY when BUILD_AGENT_STATUS is "ran_with_override"): + +-----------------------------------------------------------------------------+ + | Step 1: Is compilation_status == "not_compiled"? | + | YES -> STOP. Label = "code_not_present" | + | NO -> Continue to Step 2 | + +-----------------------------------------------------------------------------+ + | Step 2: Does build agent evidence mention "architecture mismatch" or | + | architecture-specific condition (32-bit/64-bit, endianness)? | + | YES -> STOP. 
Label = "requires_environment" | + | NO -> Continue to Step 3 | + +-----------------------------------------------------------------------------+ + | Step 3: Is build agent verdict "vulnerable_mitigated" with hardening flags? | + | YES -> STOP. Label = "protected_by_compiler" | + | NO -> Continue to Step 4 | + +-----------------------------------------------------------------------------+ + | Step 4: Is build agent verdict "not_vulnerable" for other reasons? | + | YES -> Label = "code_not_reachable" | + | NO -> Use code-agent-based rules below | + +-----------------------------------------------------------------------------+ + + CRITICAL: When the decision tree applies: + - compilation_status="not_compiled" ALWAYS means "code_not_present" + - Do NOT output "vulnerable" if build agent has an override verdict + - The build agent verdict supersedes code agent source analysis + + AVAILABLE LABELS: + - code_not_present: Vulnerable code/function is NOT COMPILED into the binary + (e.g., disabled via configure flags like --no-ssl, --without-feature, or + conditionally excluded at build time) + - code_not_reachable: Code IS compiled but cannot be reached/executed in this context + - protected_by_mitigating_control: Downstream patch or backport mitigates the vulnerability + - protected_by_compiler: Compiler hardening flags (FORTIFY_SOURCE, stack protector, etc.) + mitigate the vulnerability + - requires_environment: Vulnerability requires specific RUNTIME/ARCHITECTURE conditions + not present (e.g., 32-bit integer overflow impossible on 64-bit, big-endian specific + bug on little-endian system). NOT for build-time configuration. + - vulnerable: Package is actually vulnerable and needs patching + - uncertain: Insufficient information to determine exploitability + + When Override verdict is "none", the build agent ran but did not change the code agent + label—still apply compilation/hardening facts in the executive summary. 
+ + Do NOT state "vulnerable" if build agent evidence contradicts it. Instead explain: + "While source contains vulnerable patterns, the build is not affected due to [build agent reason]." + + EXAMPLES (build agent scenarios): + + Example A - SSL disabled via configure: + Build agent verdict: not_vulnerable + compilation_status: not_compiled + Evidence: "configured with --no-ssl flag" + -> CORRECT: "code_not_present" (Step 1: not_compiled -> code_not_present) + -> WRONG: "vulnerable" (build agent override supersedes code agent) + -> WRONG: "requires_environment" (this is build config, not runtime/arch) + + Example B - 64-bit architecture prevents overflow: + Build agent verdict: not_vulnerable + compilation_status: compiled + Evidence: "Architecture mismatch - 32-bit overflow impossible on 64-bit" + -> CORRECT: "requires_environment" (Step 2: architecture condition) + -> WRONG: "code_not_present" (code IS compiled, just not exploitable) + + Example C - Hardening flags mitigate: + Build agent verdict: vulnerable_mitigated + compilation_status: compiled + Evidence: "FORTIFY_SOURCE=2 prevents buffer overflow exploitation" + -> CORRECT: "protected_by_compiler" (Step 3: hardening mitigation) + + CODE AGENT PRECEDENCE RULES (ONLY when BUILD_AGENT_STATUS is "not_run"): + WARNING: Skip this section if BUILD_AGENT_STATUS contains "ran". + - If a CVE-specific patch file exists AND is applied in build, use "protected_by_mitigating_control". + - If the code agent found the fix code in source, use "protected_by_mitigating_control". + - If the code agent found vulnerable code pattern still present, use "vulnerable". + - If upstream shows rebase fixed the issue, use "protected_by_mitigating_control". + - Only use "uncertain" when evidence is conflicting or insufficient. + +2. 
EVIDENCE CHAIN: + - Start with target package patch availability + - Include target rebase findings (if CVE mentioned in target's spec changelog, this is from the TARGET package) + - Include reference package findings (if a known-fixed package from intel was used for comparison) + - Include code analysis findings (patch targets, vulnerable vs fix patterns) + - Reference specific files, line numbers, and code snippets + - Summarize findings; the rendered report places an "Extracted facts" section **after** the Evidence chain with verbatim spec Patch lines, changelog hits, and build log lines (when available)—do not invent `PatchN:` numbers or spec quotes; only state patch indices you could derive from the investigation text below, or point readers to *Extracted facts* for exact lines + + PHRASING GUIDANCE for code analysis findings: + - GOOD: "Code analysis verified that the patch modifies `filename.c` to address the vulnerability" + - GOOD: "Patch targets the `function_name()` function in `filename.c`" + - BAD: "L1 agent found the fix code in the source" (use "code agent" — do not say L1/L2) + - BAD: "L2 says not vulnerable" (use "build agent") + - BAD: "Found fix in source" (unclear what was found) + - Use active voice: "The patch adds validation..." not "Validation was found..." + +3. CODE SNIPPETS: + - ALWAYS output an empty list: "code_snippets": [] + - Code snippets are extracted PROGRAMMATICALLY from parsed patches after your response + - Do NOT generate code snippets - it wastes tokens and they get overwritten anyway + - Instead, focus on populating affected_files with CVE-relevant source paths so patch hunks can be prioritized. + +4. EXECUTIVE SUMMARY (scenario-aware; use "code agent" and "build agent", never L1/L2): + + Read BUILD_AGENT_STATUS first: + + A) Build agent status: ran_with_override (Override verdict is not "none"): + - Write 4-5 sentences. 
+ - Sentence 1 (Verdict): State final posture clearly + - Sentences 2-3 (Technical Context): Nature of the flaw and why the build agent negates or mitigates it + - Sentences 4-5 (Reconciliation): Why the code agent found patterns in source but the build agent changes exploitability + + B) Build agent status: ran_no_override: + - Write 3-5 sentences. + - Sentence 1: State verdict (code agent label stands) + - Include 1-2 sentences on compilation status and/or hardening from BUILD_AGENT_CONTEXT when present + - Do NOT say the build agent is absent, missing, or "context is not present" + + C) Build agent status: not_run: + - Write 3-4 sentences on code agent and patch intel only + - Do NOT mention the build agent at all + + FORBIDDEN phrases (never write these): + - "build agent context is not present" + - "L2 context is not present" + - "verdict is based solely on the code agent analysis" (when BUILD_AGENT_STATUS is ran_no_override) + - "because the build agent did not run" (when BUILD_AGENT_STATUS is ran_no_override) + + Do NOT invent RHSA IDs, function names, or technical details not present in the context. + +5. PATCH ANALYSIS (semantic fix narrative): + - When target package patch or reference patch evidence exists, briefly describe **what** the fix does: name the function(s) or file(s) and the nature of the change (e.g. "adds range validation 15–17 in parse_rockridge_ZF1"). + - Derive this from Target Package Analysis, Reference Intel, patch file names, or code agent excerpts—do NOT invent code or function names absent from investigation results. + +6. DELIVERY MODEL: + - When a CVE-named patch file is present, explicitly note that the fix is carried as a separate `%patch` directive while the upstream tarball (`Source0`) version may remain unchanged. + - Encourage citing "Extracted facts" for exact spec `PatchN:` and `Source0`/`Version` lines when shown below. + +7. CAVEATS (optional): + - Note any missing data (no patch file, no build log, etc.) 
+ - Flag low-confidence findings that may need manual review + - Leave empty if no significant gaps exist + + + +Provide a structured JSON response with: +- justification_label: one of the labels above +- executive_summary: 3-5 sentence summary (see Instruction #4 for structure) +- evidence_chain: list of evidence items in logical order +- affected_files: list of source files involved +- patch_analysis: analysis of patches (or null if none) +- code_snippets: ALWAYS an empty list [] (snippets are extracted programmatically from patches) +- caveats: list of investigation gaps or uncertainties (empty list if none) + +Ensure all code snippets and special characters within JSON string values are properly escaped +(e.g., quotes as \\", backslashes as \\\\, newlines as \\n) to maintain valid JSON format. + +""" diff --git a/src/vuln_analysis/utils/tests/test_vulnerability_intel_sanitizer.py b/src/vuln_analysis/utils/tests/test_vulnerability_intel_sanitizer.py new file mode 100644 index 000000000..d98ccbc35 --- /dev/null +++ b/src/vuln_analysis/utils/tests/test_vulnerability_intel_sanitizer.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for VulnerabilityIntelSanitizer v1.""" + +from exploit_iq_commons.data_models.checker_status import VulnerabilityIntel + +from vuln_analysis.functions.code_agent_graph_defs import ParsedPatch, PatchFile +from vuln_analysis.utils.vulnerability_intel_sanitizer import VulnerabilityIntelSanitizer + + +def _patch_with_util_c() -> ParsedPatch: + return ParsedPatch( + patch_filename="cve.patch", + files=[ + PatchFile( + source_path="tar/util.c", + target_path="tar/util.c", + hunks=[], + ) + ], + ) + + +class TestSanitizeAffectedFiles: + def test_clears_affected_files_when_no_patch(self): + raw = VulnerabilityIntel(affected_files=["generator.c", "tar/util.c"]) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.affected_files == [] + + def test_clears_affected_files_when_empty_files(self): + patch = ParsedPatch(patch_filename="empty.patch", files=[]) + raw = VulnerabilityIntel(affected_files=["util.c"]) + result = VulnerabilityIntelSanitizer(patch).apply(raw) + assert result.affected_files == [] + + def test_keeps_matching_basename_with_patch(self): + raw = VulnerabilityIntel(affected_files=["util.c"]) + result = VulnerabilityIntelSanitizer(_patch_with_util_c()).apply(raw) + assert result.affected_files == ["util.c"] + + def test_keeps_full_path_when_basename_matches_patch(self): + raw = VulnerabilityIntel(affected_files=["tar/util.c"]) + result = VulnerabilityIntelSanitizer(_patch_with_util_c()).apply(raw) + assert result.affected_files == ["tar/util.c"] + + def test_drops_non_patch_file_with_patch(self): + raw = VulnerabilityIntel(affected_files=["util.c", "generator.c"]) + result = VulnerabilityIntelSanitizer(_patch_with_util_c()).apply(raw) + assert result.affected_files == ["util.c"] + + +class TestFilterVulnerableFunctions: + def test_drops_function_with_spaces(self): + raw = VulnerabilityIntel( + vulnerable_functions=["parse_header", "rsync compares file checksums"], + ) + result = 
VulnerabilityIntelSanitizer(_patch_with_util_c()).apply(raw) + assert result.vulnerable_functions == ["parse_header"] + + def test_drops_function_with_spaces_without_patch(self): + raw = VulnerabilityIntel(vulnerable_functions=["rsync compares file checksums"]) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.vulnerable_functions == [] + + +class TestFilterSearchKeywords: + def test_drops_keyword_with_spaces_no_boolean(self): + raw = VulnerabilityIntel( + search_keywords=["s2length", "sum2", "rsync compares file checksums"], + ) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.search_keywords == ["s2length", "sum2"] + + def test_keeps_keyword_with_or(self): + raw = VulnerabilityIntel(search_keywords=["s2length OR sum2"]) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.search_keywords == ["s2length OR sum2"] + + def test_keeps_keyword_with_and(self): + raw = VulnerabilityIntel(search_keywords=["foo AND bar"]) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.search_keywords == ["foo AND bar"] + + +class TestRsyncStyleNoPatch: + def test_strips_hallucinated_paths_and_prose(self): + raw = VulnerabilityIntel( + affected_files=["generator.c", "src/foo.c"], + vulnerable_functions=["rsync compares file checksums"], + vulnerable_variables=["s2length", "sum2"], + search_keywords=["s2length", "sum2", "rsync compares file checksums"], + ) + result = VulnerabilityIntelSanitizer(None).apply(raw) + assert result.affected_files == [] + assert result.vulnerable_functions == [] + assert result.vulnerable_variables == ["s2length", "sum2"] + assert result.search_keywords == ["s2length", "sum2"] diff --git a/src/vuln_analysis/utils/tests/test_web_patch_fetcher.py b/src/vuln_analysis/utils/tests/test_web_patch_fetcher.py new file mode 100644 index 000000000..42f70dbaa --- /dev/null +++ b/src/vuln_analysis/utils/tests/test_web_patch_fetcher.py @@ -0,0 +1,426 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for web_patch_fetcher module.""" + +import pytest +from unittest.mock import MagicMock, patch + +from vuln_analysis.utils.web_patch_fetcher import ( + WebPatchFetcher, + build_patch_url_from_repo, + _GITHUB_COMMIT_PATTERN, + _GITHUB_PR_PATTERN, + _GITWEB_COMMIT_PATTERN, + _KERNEL_CGIT_COMMIT_PATTERN, + _KERNEL_SHORT_PATTERN, +) + + +class TestUrlPatterns: + """Test URL pattern matching.""" + + def test_github_commit_pattern(self): + """Test GitHub commit URL pattern matching.""" + url = "https://github.com/libarchive/libarchive/commit/a2a73a8f14b3208c7f6acbbc93265254a7c1efd0" + match = _GITHUB_COMMIT_PATTERN.match(url) + assert match is not None + assert match.group(1) == "libarchive/libarchive" + assert match.group(2) == "a2a73a8f14b3208c7f6acbbc93265254a7c1efd0" + + def test_github_commit_pattern_short_sha(self): + """Test GitHub commit URL with short SHA.""" + url = "https://github.com/curl/curl/commit/39d1976b7f" + match = _GITHUB_COMMIT_PATTERN.match(url) + assert match is not None + assert match.group(1) == "curl/curl" + assert match.group(2) == "39d1976b7f" + + def test_github_pr_pattern(self): + """Test GitHub PR URL pattern matching.""" + url = "https://github.com/libarchive/libarchive/pull/2934" + match = _GITHUB_PR_PATTERN.match(url) + assert match is not None + assert 
match.group(1) == "libarchive/libarchive" + assert match.group(2) == "2934" + + def test_kernel_cgit_commit_pattern(self): + """Test kernel.org cgit commit URL pattern.""" + url = ( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + "/commit/?id=f342de4e2f33e0e39165d8639387aa6c19dff660" + ) + match = _KERNEL_CGIT_COMMIT_PATTERN.match(url) + assert match is not None + expected_repo = "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + assert match.group(1) == expected_repo + assert match.group(2) == "f342de4e2f33e0e39165d8639387aa6c19dff660" + + def test_kernel_short_pattern(self): + """Test kernel.org short URL pattern.""" + url = "https://git.kernel.org/stable/c/096bb5b43edf755bc4477e64004fa3a20539ec2f" + match = _KERNEL_SHORT_PATTERN.match(url) + assert match is not None + assert match.group(1) == "stable" + assert match.group(2) == "096bb5b43edf755bc4477e64004fa3a20539ec2f" + + def test_gitweb_commit_pattern(self): + """Test gitweb commit URL pattern.""" + url = "https://git.samba.org/?p=rsync.git;a=commit;h=6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + match = _GITWEB_COMMIT_PATTERN.match(url) + assert match is not None + assert match.group(1) == "https://git.samba.org" + assert match.group(2) == "rsync.git" + assert match.group(3) == "6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + + +class TestWebPatchFetcherUrlResolution: + """Test URL resolution in WebPatchFetcher.""" + + @pytest.fixture + def fetcher(self): + """Create a WebPatchFetcher with a mock session.""" + mock_session = MagicMock() + return WebPatchFetcher(session=mock_session) + + def test_resolve_github_commit_url(self, fetcher): + """Test resolving GitHub commit URL to patch URL.""" + url = "https://github.com/curl/curl/commit/39d1976b7f" + resolved = fetcher._resolve_github_url(url) + assert resolved is not None + assert resolved.patch_url == "https://github.com/curl/curl/commit/39d1976b7f.patch" + assert resolved.platform == "github" + assert resolved.url_type 
== "commit" + assert resolved.repo_url == "https://github.com/curl/curl" + assert resolved.commit_sha == "39d1976b7f" + + def test_resolve_github_pr_url(self, fetcher): + """Test resolving GitHub PR URL to patch URL.""" + url = "https://github.com/libarchive/libarchive/pull/2934" + resolved = fetcher._resolve_github_url(url) + assert resolved is not None + assert resolved.patch_url == "https://github.com/libarchive/libarchive/pull/2934.patch" + assert resolved.platform == "github" + assert resolved.url_type == "pull" + assert resolved.repo_url == "https://github.com/libarchive/libarchive" + assert resolved.commit_sha == "PR-2934" + + def test_resolve_ubuntu_style_url(self, fetcher): + """Test resolving Ubuntu-style 'upstream:' URL.""" + url = "upstream: https://github.com/curl/curl/commit/39d1976b7f" + resolved = fetcher._resolve_github_url(url) + assert resolved is not None + assert resolved.patch_url == "https://github.com/curl/curl/commit/39d1976b7f.patch" + assert resolved.platform == "github" + assert resolved.url_type == "commit" + + def test_resolve_kernel_cgit_commit_url(self, fetcher): + """Test resolving kernel.org cgit commit URL.""" + url = ( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + "/commit/?id=f342de4e2f33" + ) + resolved = fetcher._resolve_kernel_org_url(url) + assert resolved is not None + expected_patch = ( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + "/patch/?id=f342de4e2f33" + ) + assert resolved.patch_url == expected_patch + assert resolved.platform == "kernel.org" + assert resolved.url_type == "commit" + assert resolved.commit_sha == "f342de4e2f33" + + def test_resolve_kernel_short_stable_url(self, fetcher): + """Test resolving kernel.org short stable URL.""" + url = "https://git.kernel.org/stable/c/096bb5b43edf" + resolved = fetcher._resolve_kernel_org_url(url) + assert resolved is not None + expected_patch = ( + "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git" + 
"/patch/?id=096bb5b43edf" + ) + assert resolved.patch_url == expected_patch + assert resolved.platform == "kernel.org" + assert resolved.commit_sha == "096bb5b43edf" + + def test_resolve_kernel_short_torvalds_url(self, fetcher): + """Test resolving kernel.org short torvalds URL.""" + url = "https://git.kernel.org/torvalds/c/abc123def456" + resolved = fetcher._resolve_kernel_org_url(url) + assert resolved is not None + expected_patch = ( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + "/patch/?id=abc123def456" + ) + assert resolved.patch_url == expected_patch + assert resolved.platform == "kernel.org" + + def test_resolve_unknown_url_returns_none(self, fetcher): + """Test that unknown URLs return None.""" + url = "https://example.com/some/path" + resolved = fetcher._resolve_to_patch_url(url) + assert resolved is None + + def test_resolve_gitweb_commit_url(self, fetcher): + """Test resolving git.samba.org gitweb commit URL.""" + url = ( + "https://git.samba.org/?p=rsync.git;a=commit;" + "h=6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + ) + resolved = fetcher._resolve_gitweb_url(url) + assert resolved is not None + assert resolved.platform == "gitweb" + assert resolved.patch_url == ( + "https://git.samba.org/?p=rsync.git;a=patch;" + "h=6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + ) + assert resolved.repo_url == "https://git.samba.org/rsync.git" + assert resolved.commit_sha == "6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + + def test_resolve_gitweb_patch_url(self, fetcher): + """Test resolving gitweb patch URL directly.""" + url = ( + "https://git.samba.org/?p=rsync.git;a=patch;" + "h=6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + ) + resolved = fetcher._resolve_gitweb_url(url) + assert resolved is not None + assert resolved.platform == "gitweb" + assert resolved.patch_url == url + + +class TestBuildPatchUrlFromRepo: + """Test patch URL construction from OSV repo + commit.""" + + def test_samba_rsync_gitweb(self): + repo = 
"https://git.samba.org/rsync.git" + sha = "6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + patch_url = build_patch_url_from_repo(repo, sha) + assert patch_url == f"https://git.samba.org/?p=rsync.git;a=patch;h={sha}" + + def test_kernel_cgit_path_info(self): + repo = "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git" + sha = "f342de4e2f33" + patch_url = build_patch_url_from_repo(repo, sha) + assert patch_url == f"{repo}/patch/?id={sha}" + + def test_github_repo(self): + repo = "https://github.com/openssl/openssl" + sha = "abc123def456" + patch_url = build_patch_url_from_repo(repo, sha) + assert patch_url == "https://github.com/openssl/openssl/commit/abc123def456.patch" + + +class TestPrioritizeAndDedupe: + """Test URL prioritization and deduplication.""" + + @pytest.fixture + def fetcher(self): + """Create a WebPatchFetcher with a mock session.""" + mock_session = MagicMock() + return WebPatchFetcher(session=mock_session) + + def test_prioritize_commit_over_pull(self, fetcher): + """Test that commit URLs are prioritized over PR URLs.""" + candidates = { + "ghsa": [ + "https://github.com/foo/bar/pull/123", + "https://github.com/foo/bar/commit/abc123", + ], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert len(result) == 2 + assert result[0][2] == "commit" # commit first + assert result[1][2] == "pull" # PR second + + def test_prioritize_ubuntu_patches_first(self, fetcher): + """Test that ubuntu_patches source is prioritized for commits.""" + candidates = { + "ghsa": ["https://github.com/foo/bar/commit/ghsa123"], + "ubuntu_patches": ["upstream: https://github.com/foo/bar/commit/ubuntu123"], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert len(result) == 2 + assert result[0][1] == "ubuntu_patches" # ubuntu_patches first + assert result[1][1] == "ghsa" + + def test_deduplicate_same_url(self, fetcher): + """Test that duplicate URLs are deduplicated.""" + candidates = { + "ghsa": 
["https://github.com/foo/bar/commit/abc123"], + "nvd": ["https://github.com/foo/bar/commit/abc123"], + "rhsa": ["https://github.com/foo/bar/commit/abc123"], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert len(result) == 1 # Only one unique URL + + def test_deduplicate_with_trailing_slash(self, fetcher): + """Test deduplication handles trailing slashes.""" + candidates = { + "ghsa": ["https://github.com/foo/bar/commit/abc123"], + "nvd": ["https://github.com/foo/bar/commit/abc123/"], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert len(result) == 1 + + def test_kernel_org_urls_prioritized_as_commits(self, fetcher): + """Test that kernel.org URLs are recognized as commit URLs.""" + candidates = { + "nvd": [ + "https://git.kernel.org/stable/c/abc123", + "https://github.com/foo/bar/pull/456", + ], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert len(result) == 2 + assert result[0][2] == "commit" # kernel.org first (commit) + assert result[1][2] == "pull" # PR second + + def test_empty_candidates(self, fetcher): + """Test handling of empty candidates.""" + candidates = {} + result = fetcher._prioritize_and_dedupe(candidates) + assert result == [] + + def test_skip_non_matching_urls(self, fetcher): + """Test that non-commit/PR URLs are skipped.""" + candidates = { + "ghsa": [ + "https://github.com/advisories/GHSA-xxxx-xxxx-xxxx", + "https://example.com/something", + ], + } + result = fetcher._prioritize_and_dedupe(candidates) + assert result == [] # Advisory URLs don't match commit/PR patterns + + +class TestNormalizeUrl: + """Test URL normalization for deduplication.""" + + @pytest.fixture + def fetcher(self): + mock_session = MagicMock() + return WebPatchFetcher(session=mock_session) + + def test_normalize_removes_trailing_slash(self, fetcher): + url = "https://github.com/foo/bar/commit/abc123/" + normalized = fetcher._normalize_url_for_dedupe(url) + assert not normalized.endswith("/") + + def 
test_normalize_lowercases(self, fetcher): + url = "https://GitHub.com/Foo/Bar/Commit/ABC123" + normalized = fetcher._normalize_url_for_dedupe(url) + assert normalized == normalized.lower() + + def test_normalize_removes_patch_suffix(self, fetcher): + url = "https://github.com/foo/bar/commit/abc123.patch" + normalized = fetcher._normalize_url_for_dedupe(url) + assert not normalized.endswith(".patch") + + +class TestIsCommitUrl: + """Test commit URL detection.""" + + @pytest.fixture + def fetcher(self): + mock_session = MagicMock() + return WebPatchFetcher(session=mock_session) + + def test_github_commit_is_commit_url(self, fetcher): + assert fetcher._is_commit_url("https://github.com/foo/bar/commit/abc123") + + def test_kernel_cgit_is_commit_url(self, fetcher): + assert fetcher._is_commit_url("https://git.kernel.org/pub/scm/linux.git/commit/?id=abc123") + + def test_kernel_short_is_commit_url(self, fetcher): + assert fetcher._is_commit_url("https://git.kernel.org/stable/c/abc123") + + def test_github_pr_is_not_commit_url(self, fetcher): + assert not fetcher._is_commit_url("https://github.com/foo/bar/pull/123") + + def test_advisory_is_not_commit_url(self, fetcher): + assert not fetcher._is_commit_url("https://github.com/advisories/GHSA-xxxx") + + def test_gitweb_commit_is_commit_url(self, fetcher): + url = "https://git.samba.org/?p=rsync.git;a=commit;h=6c8ca91c731b7bf2b081694bda85b7dadc2b7aff" + assert fetcher._is_commit_url(url) + + +class TestIsPrUrl: + """Test PR URL detection.""" + + @pytest.fixture + def fetcher(self): + mock_session = MagicMock() + return WebPatchFetcher(session=mock_session) + + def test_github_pr_is_pr_url(self, fetcher): + assert fetcher._is_pr_url("https://github.com/foo/bar/pull/123") + + def test_github_commit_is_not_pr_url(self, fetcher): + assert not fetcher._is_pr_url("https://github.com/foo/bar/commit/abc123") + + +@pytest.mark.asyncio +class TestFetchFromIntelRefs: + """Test fetch_from_intel_refs integration.""" + + @pytest.fixture 
+ def mock_session(self): + return MagicMock() + + async def test_fetch_from_intel_refs_empty_candidates(self, mock_session): + """Test handling of empty candidates.""" + fetcher = WebPatchFetcher(session=mock_session) + result = await fetcher.fetch_from_intel_refs({}, "CVE-2024-1234") + assert result is None + + async def test_fetch_from_intel_refs_returns_first_success(self, mock_session): + """Test that first successful fetch is returned.""" + fetcher = WebPatchFetcher(session=mock_session) + + # Mock the fetch_from_url method + with patch.object(fetcher, 'fetch_from_url') as mock_fetch: + mock_result = MagicMock() + mock_result.parsed_patch = MagicMock() + mock_fetch.return_value = mock_result + + candidates = { + "ubuntu_patches": ["upstream: https://github.com/foo/bar/commit/abc123"], + } + result = await fetcher.fetch_from_intel_refs(candidates, "CVE-2024-1234") + + assert result is mock_result + mock_fetch.assert_called_once() + + async def test_fetch_from_intel_refs_tries_multiple_urls(self, mock_session): + """Test that multiple URLs are tried on failure.""" + fetcher = WebPatchFetcher(session=mock_session) + + with patch.object(fetcher, 'fetch_from_url') as mock_fetch: + # First call returns None (failure), second returns success + mock_result = MagicMock() + mock_result.parsed_patch = MagicMock() + mock_fetch.side_effect = [None, mock_result] + + candidates = { + "ubuntu_patches": ["upstream: https://github.com/foo/bar/commit/abc123"], + "ghsa": ["https://github.com/foo/bar/commit/def456"], + } + result = await fetcher.fetch_from_intel_refs(candidates, "CVE-2024-1234") + + assert result is mock_result + assert mock_fetch.call_count == 2 diff --git a/src/vuln_analysis/utils/token_utils.py b/src/vuln_analysis/utils/token_utils.py index c3e435892..3624e3e80 100644 --- a/src/vuln_analysis/utils/token_utils.py +++ b/src/vuln_analysis/utils/token_utils.py @@ -79,6 +79,21 @@ def truncate_tool_output(tool_output: str, tool_name: str, max_tokens: int = 400 
kept_lines.append(f"[... truncated {token_count - kept_tokens} tokens ...]") return '\n'.join(kept_lines) + if tool_name == ToolNames.SOURCE_GREP: + blocks = tool_output.split('\n--\n') + kept_blocks, kept_tokens = [], 0 + for block in blocks: + block_tokens = count_tokens(block) + if kept_tokens + block_tokens > max_tokens: + break + kept_blocks.append(block) + kept_tokens += block_tokens + truncated_count = len(blocks) - len(kept_blocks) + result = '\n--\n'.join(kept_blocks) + if truncated_count > 0: + result += f"\n[... truncated {truncated_count} more matches ...]" + return result + head_budget = int(max_tokens * 0.7) tail_budget = max_tokens - head_budget head_lines, head_tokens = [], 0 diff --git a/src/vuln_analysis/utils/vulnerability_intel_sanitizer.py b/src/vuln_analysis/utils/vulnerability_intel_sanitizer.py new file mode 100644 index 000000000..a703d0e96 --- /dev/null +++ b/src/vuln_analysis/utils/vulnerability_intel_sanitizer.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Sanitize LLM-extracted VulnerabilityIntel after L1 extraction.""" + +from __future__ import annotations + +import re +from pathlib import Path + +from exploit_iq_commons.data_models.checker_status import VulnerabilityIntel +from vuln_analysis.functions.code_agent_graph_defs import ParsedPatch + +_BOOLEAN_OP_RE = re.compile(r"\s+(?:OR|AND)\s+", re.IGNORECASE) + + +def _patch_basenames(parsed_patch: ParsedPatch) -> set[str]: + names: set[str] = set() + for patch_file in parsed_patch.files: + for path in (patch_file.source_path, patch_file.target_path): + if path: + names.add(Path(path).name.lower()) + return names + + +def _has_boolean_operator(keyword: str) -> bool: + return bool(_BOOLEAN_OP_RE.search(keyword)) + + +class VulnerabilityIntelSanitizer: + """Apply shape rules to L1 VulnerabilityIntel; extensible one method per rule.""" + + def __init__(self, parsed_patch: ParsedPatch | None = None) -> None: + self._parsed_patch = parsed_patch + + @property + def _has_trusted_patch(self) -> bool: + return self._parsed_patch is not None and bool(self._parsed_patch.files) + + def apply(self, intel: VulnerabilityIntel) -> VulnerabilityIntel: + intel = self.sanitize_affected_files(intel) + intel = self.filter_vulnerable_functions(intel) + return self.filter_search_keywords(intel) + + def sanitize_affected_files(self, intel: VulnerabilityIntel) -> VulnerabilityIntel: + parsed_patch = self._parsed_patch + if parsed_patch is None or not parsed_patch.files: + return intel.model_copy(update={"affected_files": []}) + + allowed = _patch_basenames(parsed_patch) + kept = [ + path + for path in intel.affected_files + if Path(path).name.lower() in allowed + ] + return intel.model_copy(update={"affected_files": kept}) + + def filter_vulnerable_functions(self, intel: VulnerabilityIntel) -> VulnerabilityIntel: + kept = [name for name in intel.vulnerable_functions if " " not in name] + return intel.model_copy(update={"vulnerable_functions": 
kept}) + + def filter_search_keywords(self, intel: VulnerabilityIntel) -> VulnerabilityIntel: + kept = [ + kw + for kw in intel.search_keywords + if " " not in kw or _has_boolean_operator(kw) + ] + return intel.model_copy(update={"search_keywords": kept}) diff --git a/src/vuln_analysis/utils/web_patch_fetcher.py b/src/vuln_analysis/utils/web_patch_fetcher.py new file mode 100644 index 000000000..cd0d2018b --- /dev/null +++ b/src/vuln_analysis/utils/web_patch_fetcher.py @@ -0,0 +1,1073 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Web Patch Fetcher - unified patch fetching from intel references and OSV API. + +This module provides a unified interface for fetching upstream fix patches from: +- Intel references (GHSA, NVD, RHSA, Ubuntu) +- Chromium issue tracker (via Gerrit/Gitiles) +- OSV API (fallback) + +Supported platforms: +- GitHub (/commit/, /pull/) +- kernel.org (cgit format, short /stable/c/ format) +- gitweb (e.g. 
git.samba.org — ``?p=repo.git;a=patch;h=``) +- gitiles (Chromium/Google source, base64-encoded patches) +""" + +from __future__ import annotations + +import base64 +import os +import re +from typing import Literal, TYPE_CHECKING + +from urllib.parse import urlparse, unquote + +import aiohttp +from pydantic import BaseModel +from unidiff import PatchSet + +from exploit_iq_commons.logging.loggers_factory import LoggingFactory +from vuln_analysis.utils.async_http_utils import request_with_retry +from vuln_analysis.utils.intel_utils import CHROMIUM_ISSUE_PATTERN +from vuln_analysis.utils.gerrit_client import ( + search_changes_by_bug, + list_merged_changes, + select_gerrit_change, + get_current_commit_sha, + project_to_gitiles_repo_url, + build_gitiles_patch_url, +) +from vuln_analysis.functions.code_agent_graph_defs import ( + OSVPatchResult, + ParsedPatch, + PatchFile, + PatchHunk, +) + +if TYPE_CHECKING: + from langchain_core.language_models import BaseChatModel + +logger = LoggingFactory.get_agent_logger(__name__) + +# Re-export for backwards compatibility +WebPatchResult = OSVPatchResult + +# Environment configuration +_GITHUB_TOKEN = os.environ.get("GHSA_API_KEY") +_OSV_API_URL = os.environ.get("OSV_API_URL", "https://api.osv.dev/v1/vulns/") +_OSV_TIMEOUT_SECONDS = int(os.environ.get("OSV_TIMEOUT_SECONDS", "10")) +_PATCH_FETCH_TIMEOUT_SECONDS = int(os.environ.get("PATCH_FETCH_TIMEOUT_SECONDS", "30")) + +# Binary file extensions to skip when parsing patches +_BINARY_FILE_EXTENSIONS = frozenset({ + '.uu', '.uue', '.iso', '.bin', '.gz', '.bz2', '.xz', '.zip', '.tar', '.tgz', '.tbz2', + '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', + '.pdf', '.doc', '.docx', '.xls', '.xlsx', + '.exe', '.dll', '.so', '.dylib', '.a', '.o', '.obj', + '.pyc', '.pyo', '.class', '.jar', '.war', +}) + +# URL patterns for platform detection and resolution +_GITHUB_COMMIT_PATTERN = re.compile( + r"https?://github\.com/([^/]+/[^/]+)/commit/([a-f0-9]+)" +) +_GITHUB_PR_PATTERN = 
re.compile( + r"https?://github\.com/([^/]+/[^/]+)/pull/(\d+)" +) +_GITHUB_REPO_PATTERN = re.compile( + r"https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$" +) + +# kernel.org cgit patterns +_KERNEL_CGIT_COMMIT_PATTERN = re.compile( + r"(https://git\.kernel\.org/.+\.git)/commit/\?id=([a-f0-9]+)" +) +_KERNEL_CGIT_PATCH_PATTERN = re.compile( + r"https://git\.kernel\.org/.+\.git/patch/\?id=[a-f0-9]+" +) +_KERNEL_SHORT_PATTERN = re.compile( + r"https://git\.kernel\.org/(\w+)/c/([a-f0-9]+)" +) + +# Mapping from kernel.org short paths to full repo paths +_KERNEL_SHORT_PATH_MAP = { + "stable": "linux/kernel/git/stable/linux.git", + "torvalds": "linux/kernel/git/torvalds/linux.git", + "next": "linux/kernel/git/next/linux-next.git", +} + +# Ubuntu patch reference pattern (e.g., "upstream: https://github.com/.../commit/...") +_UBUNTU_PATCH_URL_PATTERN = re.compile( + r"(?:upstream:\s*)?(https://github\.com/[^/]+/[^/]+/commit/([a-f0-9]+))" +) + +# gitweb (Samba, etc.): ?p=repo.git;a=commit|patch;h= +_GITWEB_COMMIT_PATTERN = re.compile( + r"(https?://[^/]+)/\?p=([^;&]+\.git);a=commit;h=([a-f0-9]+)" +) +_GITWEB_PATCH_PATTERN = re.compile( + r"(https?://[^/]+)/\?p=([^;&]+\.git);a=patch;h=([a-f0-9]+)" +) + +# Gitiles (Chromium/Google source): /+/ with optional ^! and ?format=TEXT +# Matches: https://chromium.googlesource.com/angle/angle/+/abc123 +# https://chromium.googlesource.com/chromium/src/+/abc123%5E%21?format=TEXT +_GITILES_COMMIT_PATTERN = re.compile( + r"(https://[^/]+\.googlesource\.com/[^+]+)/\+/([a-f0-9]+)(?:%5E%21|\^!)?" 
+) + + +# --------------------------------------------------------------------------- +# Data Models +# --------------------------------------------------------------------------- + +class OSVAffectedRange(BaseModel): + """Represents a Git range from an OSV affected block.""" + repo_url: str | None = None + fixed_commit: str | None = None + introduced_commit: str | None = None + + +class ResolvedUrl(BaseModel): + """Result of resolving a URL to a patch download URL.""" + patch_url: str + platform: Literal["github", "kernel.org", "gitweb", "cgit", "gitiles"] + url_type: Literal["commit", "pull", "chromium_issue"] + repo_url: str | None = None + commit_sha: str | None = None + + +# --------------------------------------------------------------------------- +# Helper Functions +# --------------------------------------------------------------------------- + +def _is_binary_file_path(path: str) -> bool: + """Check if file path has a binary file extension.""" + path_lower = path.lower() + return any(path_lower.endswith(ext) for ext in _BINARY_FILE_EXTENSIONS) + + +def _parse_patch_content(patch_content: str, patch_filename: str) -> ParsedPatch | None: + """Parse patch content string into structured ParsedPatch model.""" + try: + patch_set = PatchSet.from_string(patch_content) + except Exception: + logger.warning("_parse_patch_content: failed to parse patch content") + return None + + files: list[PatchFile] = [] + for patched_file in patch_set: + if patched_file.is_binary_file: + continue + if _is_binary_file_path(patched_file.target_file): + continue + + hunks: list[PatchHunk] = [] + for hunk in patched_file: + context, removed, added = [], [], [] + for line in hunk: + if line.is_context: + context.append(str(line.value).rstrip("\n")) + elif line.is_removed: + removed.append(str(line.value).rstrip("\n")) + elif line.is_added: + added.append(str(line.value).rstrip("\n")) + + hunks.append(PatchHunk( + source_start=hunk.source_start, + source_length=hunk.source_length, + 
target_start=hunk.target_start, + target_length=hunk.target_length, + context_lines=context, + removed_lines=removed, + added_lines=added, + )) + + files.append(PatchFile( + source_path=patched_file.source_file, + target_path=patched_file.target_file, + hunks=hunks, + is_new_file=patched_file.is_added_file, + is_deleted_file=patched_file.is_removed_file, + )) + + return ParsedPatch(patch_filename=patch_filename, files=files) + + +def _extract_commit_metadata(patch_content: str) -> tuple[str | None, str | None, str | None]: + """Extract commit message, author, and date from .patch format. + + Works with both GitHub and kernel.org patch formats. + """ + lines = patch_content.split('\n') + author = None + date = None + subject_lines = [] + in_subject = False + + for line in lines: + if line.startswith('From:'): + author = line[5:].strip() + elif line.startswith('Date:'): + date = line[5:].strip() + elif line.startswith('Subject:'): + in_subject = True + subject_part = line[8:].strip() + if subject_part.startswith('[PATCH'): + idx = subject_part.find(']') + if idx != -1: + subject_part = subject_part[idx + 1:].strip() + subject_lines.append(subject_part) + elif in_subject: + if line.startswith('---') or line.startswith('diff --git'): + break + if line.strip() == '': + in_subject = False + else: + subject_lines.append(line.strip()) + + commit_message = ' '.join(subject_lines).strip() if subject_lines else None + return commit_message, author, date + + +def build_patch_url_from_repo(repo_url: str, commit_sha: str) -> str | None: + """Build a patch download URL from a git repo URL and commit SHA. + + Supports GitHub, gitweb (shallow ``host/repo.git`` paths), and cgit path-info. 
+ """ + match = _GITHUB_REPO_PATTERN.match(repo_url) + if match: + repo_path = match.group(1) + return f"https://github.com/{repo_path}/commit/{commit_sha}.patch" + if "github.com" in repo_url: + parts = repo_url.rstrip('/').split('/') + if len(parts) >= 2: + repo_path = '/'.join(parts[-2:]).replace('.git', '') + return f"https://github.com/{repo_path}/commit/{commit_sha}.patch" + + parsed = urlparse(repo_url.rstrip('/')) + if not parsed.path.endswith('.git'): + return None + + path_parts = [p for p in parsed.path.split('/') if p] + if not path_parts: + return None + + project = path_parts[-1] + base = f"{parsed.scheme}://{parsed.netloc}" + + # Shallow repo path (e.g. git.samba.org/rsync.git) → gitweb + if len(path_parts) == 1: + return f"{base}/?p={project};a=patch;h={commit_sha}" + + # Deep path (e.g. git.kernel.org/pub/scm/.../linux.git) → cgit path-info + return f"{repo_url.rstrip('/')}/patch/?id={commit_sha}" + + +# --------------------------------------------------------------------------- +# WebPatchFetcher - Unified Patch Fetcher +# --------------------------------------------------------------------------- + +class WebPatchFetcher: + """Unified patch fetcher for intel references and direct URLs. 
+ + Supports: + - GitHub commit URLs (/commit/) + - GitHub PR URLs (/pull/) + - kernel.org cgit URLs (/commit/?id=, /patch/?id=) + - kernel.org short URLs (/stable/c/, /torvalds/c/) + - gitweb URLs (?p=repo.git;a=commit|patch;h=) + + Usage: + async with aiohttp.ClientSession() as session: + fetcher = WebPatchFetcher(session) + result = await fetcher.fetch_from_intel_refs(candidates, "CVE-2024-1234") + if result: + # Use result.parsed_patch + pass + """ + + def __init__( + self, + session: aiohttp.ClientSession, + timeout: int = _PATCH_FETCH_TIMEOUT_SECONDS, + ): + self._session = session + self._timeout = aiohttp.ClientTimeout(total=timeout) + self._headers = {"User-Agent": "vuln-analysis/1.0"} + if _GITHUB_TOKEN: + self._headers["Authorization"] = f"token {_GITHUB_TOKEN}" + self._cache: dict[str, WebPatchResult] = {} + + async def fetch_from_intel_refs( + self, + candidates: dict[str, list[str]], + cve_id: str, + cve_description: str | None = None, + llm: "BaseChatModel | None" = None, + ) -> WebPatchResult | None: + """Fetch patch from intel reference URLs in priority order. + + Priority: + 1. /commit/ URLs (ubuntu_patches first, then ghsa, nvd, rhsa, ubuntu) + 2. /pull/ URLs (ghsa, nvd, rhsa, ubuntu) + 3. 
Chromium issue URLs (require Gerrit/Gitiles resolution) + + Args: + candidates: Dict mapping source to list of URLs + Example: {"ghsa": ["https://github.com/.../commit/..."], "nvd": [...]} + cve_id: CVE identifier for logging and result + cve_description: Optional CVE description for Chromium CL disambiguation + llm: Optional LangChain LLM for Chromium CL selection when multiple MERGED CLs exist + + Returns: + WebPatchResult on success, None if no valid patch found + """ + prioritized = self._prioritize_and_dedupe(candidates) + + if not prioritized: + logger.debug("No candidate URLs found for %s", cve_id) + return None + + urls_tried = 0 + for url, source, url_type in prioritized: + urls_tried += 1 + + if url_type == "chromium_issue": + # Handle Chromium issue URLs via Gerrit/Gitiles + result = await self._fetch_chromium_issue( + url, cve_id, source, cve_description, llm + ) + else: + # Standard commit/PR URL handling + result = await self.fetch_from_url(url, cve_id, source=source, url_type_hint=url_type) + + if result and result.parsed_patch: + logger.info( + "Intel refs: Found patch for %s from %s (%s, %s)", + cve_id, source, url_type, result.platform + ) + return result + + logger.info("Intel refs: No valid patch found for %s after trying %d URLs", cve_id, urls_tried) + return None + + async def fetch_from_url( + self, + url: str, + cve_id: str, + source: str | None = None, + url_type_hint: str | None = None, + ) -> WebPatchResult | None: + """Fetch and parse a patch from a single URL. 
+ + Args: + url: URL to fetch (commit, PR, or patch URL) + cve_id: CVE identifier + source: Source provider (for result metadata) + url_type_hint: URL type hint (commit/pull) + + Returns: + WebPatchResult on success, None on failure + """ + # Check cache first + cache_key = url.lower().rstrip("/") + if cache_key in self._cache: + logger.debug("Cache hit for %s", url) + return self._cache[cache_key] + + # Resolve URL to patch download URL + resolved = self._resolve_to_patch_url(url) + if not resolved: + logger.debug("Could not resolve URL to patch format: %s", url) + return None + + # Fetch patch content + patch_content = await self._fetch_patch_content(resolved.patch_url) + if not patch_content: + return None + + # Extract metadata and parse + commit_message, commit_author, commit_date = _extract_commit_metadata(patch_content) + commit_sha = resolved.commit_sha or "unknown" + parsed_patch = _parse_patch_content( + patch_content, + f"{cve_id}_{commit_sha[:8]}.patch" + ) + + if not parsed_patch: + logger.warning("Failed to parse patch content from %s", url) + return None + + result = WebPatchResult( + cve_id=cve_id, + fixed_commit=commit_sha[:8] if len(commit_sha) > 8 else commit_sha, + repo_url=resolved.repo_url or "", + patch_url=resolved.patch_url, + patch_content=patch_content, + parsed_patch=parsed_patch, + commit_message=commit_message, + commit_author=commit_author, + commit_date=commit_date, + source=source, + url_type=url_type_hint or resolved.url_type, + platform=resolved.platform, + ) + + # Cache successful result + self._cache[cache_key] = result + return result + + async def _fetch_chromium_issue( + self, + issue_url: str, + cve_id: str, + source: str, + cve_description: str | None, + llm: "BaseChatModel | None", + ) -> WebPatchResult | None: + """Fetch patch for a Chromium issue via Gerrit/Gitiles. + + Flow: + 1. Extract bug ID from issue URL + 2. Search Gerrit for CLs with that bug ID + 3. Filter to MERGED CLs only + 4. 
Select the correct CL (single = use it, multiple = LLM selection) + 5. Get commit SHA from selected CL + 6. Fetch patch from Gitiles + + Args: + issue_url: Chromium issue URL (e.g., https://issues.chromium.org/issues/466192044) + cve_id: CVE identifier + source: Source provider (ghsa, nvd, etc.) + cve_description: CVE description for LLM disambiguation + llm: Optional LangChain LLM for CL selection + + Returns: + WebPatchResult on success, None on failure + """ + # Extract bug ID from URL + match = CHROMIUM_ISSUE_PATTERN.match(issue_url) + if not match: + logger.debug("Invalid Chromium issue URL: %s", issue_url) + return None + + bug_id = match.group(1) + logger.info("Chromium issue: Searching Gerrit for bug %s", bug_id) + + # Search Gerrit for CLs with this bug ID + raw_changes = await search_changes_by_bug(self._session, bug_id) + if not raw_changes: + logger.info("Chromium issue: No CLs found for bug %s", bug_id) + return None + + # Filter to MERGED only + merged = list_merged_changes(raw_changes) + if not merged: + logger.info("Chromium issue: No MERGED CLs found for bug %s", bug_id) + return None + + logger.info( + "Chromium issue: Found %d MERGED CLs for bug %s: %s", + len(merged), bug_id, [c.submission_id for c in merged] + ) + + # Select the correct CL + selected_id = await select_gerrit_change( + merged, cve_id, cve_description or "", llm + ) + if selected_id is None: + logger.info("Chromium issue: Could not select CL for bug %s", bug_id) + return None + + # Find the selected candidate to get project info + selected = next((c for c in merged if c.submission_id == selected_id), None) + if not selected: + logger.warning("Chromium issue: Selected ID %d not found in candidates", selected_id) + return None + + logger.info( + "Chromium issue: Selected CL %d (%s) for bug %s", + selected_id, selected.project, bug_id + ) + + # Get commit SHA from the selected CL + commit_sha = await get_current_commit_sha(self._session, selected_id) + if not commit_sha: + 
logger.warning("Chromium issue: Could not get commit SHA for CL %d", selected_id) + return None + + # Build Gitiles URL and fetch patch + repo_url = project_to_gitiles_repo_url(selected.project) + patch_url = build_gitiles_patch_url(repo_url, commit_sha) + + logger.info("Chromium issue: Fetching patch from %s", patch_url) + + # Fetch patch content (Gitiles base64 decoding handled automatically) + patch_content = await self._fetch_patch_content(patch_url) + if not patch_content: + logger.warning("Chromium issue: Failed to fetch patch from Gitiles") + return None + + # Parse patch + commit_message, commit_author, commit_date = _extract_commit_metadata(patch_content) + parsed_patch = _parse_patch_content( + patch_content, + f"{cve_id}_{commit_sha[:8]}.patch" + ) + + if not parsed_patch: + logger.warning("Chromium issue: Failed to parse patch from Gitiles") + return None + + return WebPatchResult( + cve_id=cve_id, + fixed_commit=commit_sha[:8] if len(commit_sha) > 8 else commit_sha, + repo_url=repo_url, + patch_url=patch_url, + patch_content=patch_content, + parsed_patch=parsed_patch, + commit_message=commit_message or selected.subject, + commit_author=commit_author, + commit_date=commit_date, + source=source, + url_type="chromium_issue", + platform="gitiles", + ) + + def _prioritize_and_dedupe( + self, + candidates: dict[str, list[str]], + ) -> list[tuple[str, str, str]]: + """Prioritize and deduplicate candidate URLs. + + Returns list of (url, source, url_type) tuples in priority order. + + Priority order: + 1. /commit/ URLs (direct patches, highest confidence) + 2. /pull/ URLs (may contain multiple commits) + 3. 
Chromium issue URLs (require Gerrit/Gitiles resolution, lowest priority) + """ + seen: set[str] = set() + result: list[tuple[str, str, str]] = [] + + # Priority 1: /commit/ URLs (and kernel.org commit URLs) + # Order: ubuntu_patches first (curated), then other sources + commit_sources = ["ubuntu_patches", "ghsa", "nvd", "rhsa", "ubuntu"] + for source in commit_sources: + for url in candidates.get(source, []): + normalized = self._normalize_url_for_dedupe(url) + if self._is_commit_url(url) and normalized not in seen: + seen.add(normalized) + result.append((url, source, "commit")) + + # Priority 2: /pull/ URLs + pr_sources = ["ghsa", "nvd", "rhsa", "ubuntu"] + for source in pr_sources: + for url in candidates.get(source, []): + normalized = self._normalize_url_for_dedupe(url) + if self._is_pr_url(url) and normalized not in seen: + seen.add(normalized) + result.append((url, source, "pull")) + + # Priority 3: Chromium issue URLs (lowest priority - require Gerrit/Gitiles resolution) + chromium_sources = ["ghsa", "nvd", "rhsa", "ubuntu"] + for source in chromium_sources: + for url in candidates.get(source, []): + normalized = self._normalize_url_for_dedupe(url) + if self._is_chromium_issue_url(url) and normalized not in seen: + seen.add(normalized) + result.append((url, source, "chromium_issue")) + + return result + + def _normalize_url_for_dedupe(self, url: str) -> str: + """Normalize URL for deduplication.""" + # Remove trailing slash, convert to lowercase + normalized = url.rstrip("/").lower() + # Remove .patch suffix for comparison + if normalized.endswith(".patch"): + normalized = normalized[:-6] + # Remove query string for commit URLs (but keep for kernel.org) + if "github.com" in normalized and "?" 
def _is_commit_url(self, url: str) -> bool:
    """Return True when ``url`` looks like a single-commit URL.

    Recognized markers (case-insensitive substring match):
    ``/commit/`` (GitHub, cgit), ``/c/`` (kernel.org short form),
    ``?id=`` (cgit path-info form) and ``;a=commit;`` (gitweb).
    """
    commit_markers = ("/commit/", "/c/", "?id=", ";a=commit;")
    lowered = url.lower()
    return any(marker in lowered for marker in commit_markers)


def _is_pr_url(self, url: str) -> bool:
    """Return True when ``url`` contains a ``/pull/`` path segment (any case)."""
    lowered = url.lower()
    return "/pull/" in lowered


def _is_chromium_issue_url(self, url: str) -> bool:
    """Return True when ``url`` matches ``CHROMIUM_ISSUE_PATTERN``.

    Such URLs point at the Chromium issue tracker
    (issues.chromium.org/issues/) rather than at a patch, so they have to be
    resolved through Gerrit/Gitiles before a patch can be fetched.
    """
    issue_match = CHROMIUM_ISSUE_PATTERN.match(url)
    return bool(issue_match)


def _resolve_to_patch_url(self, url: str) -> ResolvedUrl | None:
    """Resolve ``url`` to a patch download URL.

    Platform handlers are tried in a fixed order -- GitHub, gitweb
    (e.g. git.samba.org), kernel.org, Gitiles (Chromium/Google) -- and the
    first one that produces a result wins.

    Returns:
        The resolved URL description, or ``None`` when no handler recognizes
        the URL.
    """
    handlers = (
        self._resolve_github_url,
        self._resolve_gitweb_url,
        self._resolve_kernel_org_url,
        self._resolve_gitiles_url,
    )
    for handler in handlers:
        resolved = handler(url)
        if resolved:
            return resolved
    return None
def _resolve_github_url(self, url: str) -> ResolvedUrl | None:
    """Resolve a GitHub commit or pull-request URL to its ``.patch`` URL.

    Three shapes are recognized, in this order:

    * a commit URL (``_GITHUB_COMMIT_PATTERN``),
    * a pull-request URL (``_GITHUB_PR_PATTERN``) -- there is no single
      commit, so ``commit_sha`` is reported as ``PR-<number>``,
    * an Ubuntu-style reference that embeds a commit URL somewhere in the
      text, e.g. ``upstream: https://github.com/.../commit/...``
      (``_UBUNTU_PATCH_URL_PATTERN``, searched rather than anchored).

    Returns:
        The resolved URL, or ``None`` if ``url`` matches none of the shapes.
    """
    if (commit_match := _GITHUB_COMMIT_PATTERN.match(url)):
        repo_path, commit_sha = commit_match.groups()
        return ResolvedUrl(
            patch_url=f"https://github.com/{repo_path}/commit/{commit_sha}.patch",
            platform="github",
            url_type="commit",
            repo_url=f"https://github.com/{repo_path}",
            commit_sha=commit_sha,
        )

    if (pr_match := _GITHUB_PR_PATTERN.match(url)):
        repo_path, pr_number = pr_match.groups()
        return ResolvedUrl(
            patch_url=f"https://github.com/{repo_path}/pull/{pr_number}.patch",
            platform="github",
            url_type="pull",
            repo_url=f"https://github.com/{repo_path}",
            commit_sha=f"PR-{pr_number}",
        )

    if (ubuntu_match := _UBUNTU_PATCH_URL_PATTERN.search(url)):
        commit_url, commit_sha = ubuntu_match.groups()
        return ResolvedUrl(
            patch_url=f"{commit_url}.patch",
            platform="github",
            url_type="commit",
            # Everything before the last "/commit/" is the repository URL.
            repo_url=commit_url.rsplit("/commit/", 1)[0],
            commit_sha=commit_sha,
        )

    return None


def _resolve_gitweb_url(self, url: str) -> ResolvedUrl | None:
    """Resolve a gitweb commit or patch URL (e.g. git.samba.org).

    A URL that already is a gitweb patch URL is used unchanged; a gitweb
    commit URL is rewritten to the matching ``a=patch`` URL.

    Returns:
        The resolved URL, or ``None`` if ``url`` is not a gitweb URL.
    """
    already_patch = _GITWEB_PATCH_PATTERN.match(url)
    if already_patch:
        base, project, commit_sha = already_patch.groups()
        return ResolvedUrl(
            patch_url=url,
            platform="gitweb",
            url_type="commit",
            repo_url=f"{base}/{project}",
            commit_sha=commit_sha,
        )

    commit_view = _GITWEB_COMMIT_PATTERN.match(url)
    if commit_view:
        base, project, commit_sha = commit_view.groups()
        return ResolvedUrl(
            patch_url=f"{base}/?p={project};a=patch;h={commit_sha}",
            platform="gitweb",
            url_type="commit",
            repo_url=f"{base}/{project}",
            commit_sha=commit_sha,
        )

    return None
def _resolve_kernel_org_url(self, url: str) -> ResolvedUrl | None:
    """Resolve a kernel.org URL to its cgit patch URL.

    Handles, in order:

    * a URL that already is a cgit patch URL (returned unchanged),
    * a cgit commit URL (``.../commit/?id=<sha>``),
    * the short form (``/stable/c/<sha>``, ``/torvalds/c/<sha>``, ...),
      whose tree name is translated through ``_KERNEL_SHORT_PATH_MAP``.

    Returns:
        The resolved URL, or ``None`` when the URL is not a recognized
        kernel.org form or names an unknown short-form tree.
    """
    if _KERNEL_CGIT_PATCH_PATTERN.match(url):
        # Already a patch URL: only the metadata has to be extracted.
        id_param = re.search(r"\?id=([a-f0-9]+)", url)
        return ResolvedUrl(
            patch_url=url,
            platform="kernel.org",
            url_type="commit",
            repo_url=url.split("/patch/")[0] if "/patch/" in url else None,
            commit_sha=id_param.group(1) if id_param else "unknown",
        )

    cgit_commit = _KERNEL_CGIT_COMMIT_PATTERN.match(url)
    if cgit_commit:
        repo_base, commit_sha = cgit_commit.groups()
        return ResolvedUrl(
            patch_url=f"{repo_base}/patch/?id={commit_sha}",
            platform="kernel.org",
            url_type="commit",
            repo_url=repo_base,
            commit_sha=commit_sha,
        )

    short_form = _KERNEL_SHORT_PATTERN.match(url)
    if not short_form:
        return None

    tree, commit_sha = short_form.groups()
    repo_path = _KERNEL_SHORT_PATH_MAP.get(tree)
    if not repo_path:
        logger.warning("Unknown kernel.org tree: %s", tree)
        return None

    return ResolvedUrl(
        patch_url=f"https://git.kernel.org/pub/scm/{repo_path}/patch/?id={commit_sha}",
        platform="kernel.org",
        url_type="commit",
        repo_url=f"https://git.kernel.org/pub/scm/{repo_path}",
        commit_sha=commit_sha,
    )
def _resolve_gitiles_url(self, url: str) -> ResolvedUrl | None:
    """Resolve a Gitiles commit URL to its patch URL.

    Gitiles is used by Chromium and other Google projects. The patch is
    requested as ``<repo>/+/<sha>^!`` with ``?format=TEXT``, which returns
    base64-encoded content.

    Returns:
        The resolved URL, or ``None`` if ``url`` is not a Gitiles commit URL.
    """
    # Match the percent-decoded form first; fall back to the raw URL.
    match = _GITILES_COMMIT_PATTERN.match(unquote(url)) or _GITILES_COMMIT_PATTERN.match(url)
    if not match:
        return None

    repo_url, commit_sha = match.groups()
    # "^!" is written pre-encoded (%5E%21) so the URL is not left with a mix
    # of encoded and literal characters.
    patch_url = f"{repo_url}/+/{commit_sha}%5E%21?format=TEXT"

    return ResolvedUrl(
        patch_url=patch_url,
        platform="gitiles",
        url_type="commit",
        repo_url=repo_url,
        commit_sha=commit_sha,
    )


async def _fetch_patch_content(self, patch_url: str) -> str | None:
    """Fetch the raw patch text for ``patch_url``.

    Gitiles ``?format=TEXT`` URLs are delegated to ``_fetch_gitiles_patch``
    and then base64-decoded; every other URL is fetched through the shared
    session with retries.

    Returns:
        The patch text, or ``None`` on any failure: HTTP 404, HTTP 429 (rate
        limited, not retried further), any other HTTP or transport error, or
        an empty or undecodable Gitiles response.
    """
    is_gitiles = ".googlesource.com" in patch_url and "format=TEXT" in patch_url

    if is_gitiles:
        # Keep the "failure -> None" contract on this branch too: a transport
        # error raised by the Gitiles fetch must not escape to the caller.
        try:
            encoded = await self._fetch_gitiles_patch(patch_url)
        except Exception as e:
            logger.warning("Patch fetch failed: %s - %s", patch_url, e)
            return None
        return self._decode_gitiles_response(encoded) if encoded else None

    try:
        async with request_with_retry(
            session=self._session,
            request_kwargs={
                'method': 'GET',
                'url': patch_url,
                'timeout': self._timeout,
                'headers': self._headers,
            },
            max_retries=3,
            sleep_time=0.5,
            log_on_error=False,
        ) as response:
            return await response.text()

    except aiohttp.ClientResponseError as e:
        if e.status == 404:
            logger.debug("Patch not found: %s", patch_url)
        elif e.status == 429:
            logger.warning("Rate limited on %s, skipping", patch_url)
        else:
            logger.warning("Patch fetch failed: %s - %s", patch_url, e)
        return None
    except Exception as e:
        logger.warning("Patch fetch failed: %s - %s", patch_url, e)
        return None
async def _fetch_gitiles_patch(self, patch_url: str) -> str | None:
    """Fetch the base64 patch body from a Gitiles ``?format=TEXT`` URL.

    The request is made with ``requests`` rather than the shared aiohttp
    session; per the original design note, Gitiles needs the ``^!`` suffix
    to arrive exactly as ``%5E%21`` and this avoids URL re-encoding by the
    shared session. ``requests`` is synchronous, so the call runs in a
    worker thread to keep the event loop responsive during the download.

    Args:
        patch_url: Fully built Gitiles patch URL (already percent-encoded).

    Returns:
        The response body (still base64-encoded) on HTTP 200, otherwise
        ``None``. Transport errors are logged and also yield ``None``.
    """
    import asyncio

    import requests

    try:
        # timeout is (connect, read) in seconds.
        response = await asyncio.to_thread(
            requests.get, patch_url, timeout=(3.05, 15)
        )
    except requests.RequestException as e:
        logger.warning("Gitiles patch fetch failed: %s - %s", patch_url, e)
        return None

    if response.status_code == 200:
        return response.text

    logger.warning("Gitiles patch fetch failed: %s - %s", patch_url, response.status_code)
    return None


def _decode_gitiles_response(self, content: str) -> str | None:
    """Decode a Gitiles base64-encoded patch response.

    Args:
        content: Base64-encoded content from the Gitiles ``?format=TEXT``
            endpoint.

    Returns:
        The decoded patch text, or ``None`` when the payload is not valid
        base64 or does not decode as UTF-8.
    """
    try:
        return base64.b64decode(content).decode("utf-8")
    except Exception as e:
        logger.warning("Failed to decode Gitiles base64 response: %s", e)
        return None
class OSVClient:
    """Client for querying OSV API and fetching fix patches.

    Usage:
        async with aiohttp.ClientSession() as session:
            fetcher = WebPatchFetcher(session)
            client = OSVClient(session, fetcher)
            result = await client.get_fix_patch("CVE-2024-1234", "3.0.7", "openssl")
    """

    def __init__(
        self,
        session: aiohttp.ClientSession,
        patch_fetcher: WebPatchFetcher | None = None,
        osv_timeout: int = _OSV_TIMEOUT_SECONDS,
    ):
        """Store the HTTP session and the fetcher used to download patches.

        Args:
            session: aiohttp session used for the OSV API requests.
            patch_fetcher: Fetcher used to download and parse the patch; a
                ``WebPatchFetcher`` bound to ``session`` is created when omitted.
            osv_timeout: Total timeout, in seconds, for one OSV API request.
        """
        self._session = session
        self._patch_fetcher = patch_fetcher or WebPatchFetcher(session)
        self._osv_timeout = aiohttp.ClientTimeout(total=osv_timeout)

    async def get_fix_patch(
        self,
        cve_id: str,
        upstream_version: str | None = None,  # pylint: disable=unused-argument
        package_name: str | None = None,  # pylint: disable=unused-argument
    ) -> WebPatchResult | None:
        """Query OSV API and fetch the fix patch.

        Only ``FIX`` references are used as the patch source. The ``fixed``
        events of OSV ``affected`` GIT ranges are deliberately ignored: they
        mark version boundaries (e.g. release tags), not security patches, so
        using them as fix diffs would hand the checker the wrong upstream
        patch.

        Args:
            cve_id: CVE identifier (e.g., "CVE-2024-1234")
            upstream_version: Upstream version (e.g., "3.0.7") - reserved for future filtering
            package_name: Optional package name - reserved for future filtering

        Returns:
            WebPatchResult with patch data, or None if no fix was found or
            any step failed (failures are logged, never raised).
        """
        try:
            osv_data = await self._query_osv(cve_id)
            if not osv_data:
                return None

            patch_url = self._extract_commit_from_references(osv_data)
            if not patch_url:
                return None

            # Derive the repository URL from the patch URL where the layout allows it.
            repo_url = None
            if '/commit/' in patch_url:
                repo_url = patch_url.split('/commit/')[0]
            elif '/pull/' in patch_url:
                repo_url = patch_url.split('/pull/')[0]
            logger.info("OSV: Found fix commit in references for %s", cve_id)

            # The fetcher resolves GitHub commit/PR page URLs itself, so hand it
            # the URL without the ".patch" suffix.
            url_for_resolution = patch_url
            if patch_url.endswith('.patch') and 'github.com' in patch_url:
                url_for_resolution = patch_url[:-6]
            result = await self._patch_fetcher.fetch_from_url(
                url_for_resolution,
                cve_id,
                source="osv",
            )

            if result:
                # Ensure we have the correct repo_url
                if repo_url and not result.repo_url:
                    result.repo_url = repo_url
                return result

            return None

        except Exception:
            logger.warning("OSV patch retrieval failed for %s", cve_id, exc_info=True)
            return None

    async def _query_osv(self, cve_id: str) -> dict | None:
        """Query OSV API for CVE data.

        Returns:
            The decoded JSON document, or ``None`` when the CVE is unknown to
            OSV (HTTP 404) or the request fails for any other reason.
        """
        url = f"{_OSV_API_URL}{cve_id}"
        try:
            async with request_with_retry(
                session=self._session,
                request_kwargs={
                    'method': 'GET',
                    'url': url,
                    'timeout': self._osv_timeout,
                },
                max_retries=3,
                sleep_time=0.5,
                log_on_error=False,
            ) as response:
                return await response.json()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                logger.info("OSV: CVE %s not found", cve_id)
            else:
                logger.warning("OSV query failed for %s: %s", cve_id, e)
            return None
        except Exception as e:
            logger.warning("OSV query failed for %s: %s", cve_id, e)
            return None

    def _extract_commit_from_references(self, osv_data: dict) -> str | None:
        """Extract fix commit URL from OSV references.

        Scans the ``references`` list for the first ``FIX`` entry that is
        either a GitHub commit/pull-request URL (returned with a ``.patch``
        suffix) or a gitweb commit/patch URL (returned as the ``a=patch``
        form).

        Returns:
            The patch URL, or ``None`` when no usable ``FIX`` reference exists.
        """
        for ref in osv_data.get("references", []):
            if ref.get("type") != "FIX":
                continue
            # "url" may be missing or null in the record; treat both as empty.
            url = ref.get("url") or ""
            if "github.com" in url and ("/commit/" in url or "/pull/" in url):
                if not url.endswith(".patch"):
                    return f"{url}.patch"
                return url
            if ";a=commit;" in url:
                return url.replace(";a=commit;", ";a=patch;")
            if ";a=patch;" in url:
                return url

        return None

    def _find_matching_affected(self, osv_data: dict) -> dict | None:
        """Find an affected block with a GIT range containing a fixed commit.

        Not called by ``get_fix_patch``, which relies on FIX references only.
        """
        for affected in osv_data.get("affected", []):
            for range_block in affected.get("ranges", []):
                if range_block.get("type") != "GIT":
                    continue
                if any("fixed" in event for event in range_block.get("events", [])):
                    return affected
        return None

    def _extract_fix_commit(self, affected: dict) -> OSVAffectedRange:
        """Extract fixed commit hash and repo URL from affected block.

        GIT ranges are scanned in order and scanning stops at the first range
        that yields a ``fixed`` commit. Within a range the last ``introduced``
        (other than the "0" sentinel) and last ``fixed`` event win.

        Not called by ``get_fix_patch``, which relies on FIX references only.
        """
        result = OSVAffectedRange()

        for range_block in affected.get("ranges", []):
            if range_block.get("type") != "GIT":
                continue

            repo = range_block.get("repo")
            if repo:
                result.repo_url = repo

            for event in range_block.get("events", []):
                if "introduced" in event and event["introduced"] != "0":
                    result.introduced_commit = event["introduced"]
                if "fixed" in event:
                    result.fixed_commit = event["fixed"]

            if result.fixed_commit:
                break

        return result

    def _build_patch_url(self, repo_url: str, commit_sha: str) -> str | None:
        """Build patch download URL from repo URL and commit SHA.

        Returns:
            Whatever ``build_patch_url_from_repo`` returns; a falsy result is
            logged at debug level as an unsupported repo URL.
        """
        patch_url = build_patch_url_from_repo(repo_url, commit_sha)
        if not patch_url:
            logger.debug("Unsupported repo URL: %s", repo_url)
        return patch_url
class TestResolveBrewProfile:
    """resolve_brew_profile: config value versus the RPM_USER_TYPE environment variable."""

    def test_resolves_internal_from_config(self, monkeypatch):
        """With no env override, the config value "internal" is used."""
        monkeypatch.delenv("RPM_USER_TYPE", raising=False)
        profile = resolve_brew_profile("internal")
        assert profile == BrewProfileType.INTERNAL

    def test_resolves_external_from_config(self, monkeypatch):
        """With no env override, the config value "external" is used."""
        monkeypatch.delenv("RPM_USER_TYPE", raising=False)
        profile = resolve_brew_profile("external")
        assert profile == BrewProfileType.EXTERNAL

    def test_env_overrides_config(self, monkeypatch):
        """The environment variable wins over the configured value."""
        monkeypatch.setenv("RPM_USER_TYPE", "external")
        profile = resolve_brew_profile("internal")
        assert profile == BrewProfileType.EXTERNAL

    def test_yaml_env_default_syntax_without_expansion(self, monkeypatch):
        """An unexpanded ${VAR:-default} placeholder falls back to its default."""
        monkeypatch.delenv("RPM_USER_TYPE", raising=False)
        profile = resolve_brew_profile("${RPM_USER_TYPE:-internal}")
        assert profile == BrewProfileType.INTERNAL

    def test_yaml_env_default_syntax_uses_env(self, monkeypatch):
        """An unexpanded ${VAR:-default} placeholder still honours the env value."""
        monkeypatch.setenv("RPM_USER_TYPE", "external")
        profile = resolve_brew_profile("${RPM_USER_TYPE:-internal}")
        assert profile == BrewProfileType.EXTERNAL

    def test_unknown_profile_raises(self, monkeypatch):
        """An unrecognized profile name is rejected."""
        monkeypatch.delenv("RPM_USER_TYPE", raising=False)
        with pytest.raises(BrewProfileNotImplementedError):
            resolve_brew_profile("unknown")


class TestBrewProfileFlags:
    """Feature flags a BrewDownloader derives from its profile."""

    @staticmethod
    def _make_downloader(profile, tmp_path):
        """Build a downloader whose cache and checker dirs live under tmp_path."""
        rpm_dir = str(tmp_path / "rpms")
        checker_dir = str(tmp_path / "checker")
        return BrewDownloader(profile, rpm_dir, checker_dir)

    def test_external_profile_disables_build_log_fetch(self, tmp_path):
        """EXTERNAL turns off both build-log fetching and binary RPM download."""
        external = self._make_downloader(BrewProfileType.EXTERNAL, tmp_path)
        assert external.auto_fetch_build_log is False
        assert external.download_binary_rpm_enabled is False

    def test_internal_profile_enables_build_log_fetch(self, tmp_path):
        """INTERNAL keeps build-log fetching on."""
        internal = self._make_downloader(BrewProfileType.INTERNAL, tmp_path)
        assert internal.auto_fetch_build_log is True
class TestDownloadTargetArtifacts:
    """download_target_artifacts orchestration with every network step patched out."""

    def test_skips_build_log_when_auto_fetch_disabled(self, tmp_path):
        """With the EXTERNAL profile the build log is never requested."""
        rpm_cache = tmp_path / "rpms"
        checker_dir = tmp_path / "checker"
        brew = BrewDownloader(
            BrewProfileType.EXTERNAL,
            str(rpm_cache),
            str(checker_dir),
        )

        fake_build = {"id": 1, "nvr": "curl-8.11.1-8.fc42"}
        fake_srpm = rpm_cache / "curl-8.11.1-8.fc42.src.rpm"
        fake_srpm.write_bytes(b"fake-srpm")

        # Stand-ins for the Koji session objects, so nothing touches the network.
        brew._session = MagicMock()
        brew._pathinfo = MagicMock()

        with (
            patch.object(brew, "search_build", return_value=fake_build),
            patch.object(brew, "_get_srpm_url", return_value="https://example/srpm"),
            patch.object(brew, "download_srpm", return_value=fake_srpm),
            patch.object(brew, "download_build_log") as build_log_spy,
            patch("vuln_analysis.tools.brew_downloader.SourceRPMDownloader.extract_src_rpm"),
        ):
            artifacts = brew.download_target_artifacts("curl", "8.11.1", "8.fc42", "x86_64")

        build_log_spy.assert_not_called()
        assert artifacts.build_log_path is None
        assert artifacts.srpm_path == checker_dir / "source"


class TestSourceAcquisitionCacheCondition:
    """Mirror the cache-hit predicate used in cve_source_acquisition."""

    @staticmethod
    def _is_full_cache_hit(source_exists: bool, log_exists: bool, auto_fetch_build_log: bool) -> bool:
        """A hit needs the source, and the log too unless log fetching is off."""
        log_missing_but_required = auto_fetch_build_log and not log_exists
        return source_exists and not log_missing_but_required

    def test_source_only_hit_when_auto_fetch_false(self):
        """Source alone is enough when build logs are not fetched."""
        hit = self._is_full_cache_hit(
            source_exists=True,
            log_exists=False,
            auto_fetch_build_log=False,
        )
        assert hit

    def test_requires_log_when_auto_fetch_true(self):
        """With log fetching on, a missing log is a miss and a present log is a hit."""
        without_log = self._is_full_cache_hit(
            source_exists=True,
            log_exists=False,
            auto_fetch_build_log=True,
        )
        with_log = self._is_full_cache_hit(
            source_exists=True,
            log_exists=True,
            auto_fetch_build_log=True,
        )
        assert not without_log
        assert with_log
class TestExtractRhelVersion:
    """Tests for _extract_rhel_version helper."""

    def test_el7_extracts_7(self):
        major = _extract_rhel_version("el7")
        assert major == "7"

    def test_el8_extracts_8(self):
        major = _extract_rhel_version("el8")
        assert major == "8"

    def test_el10_extracts_10(self):
        """Two-digit majors are returned whole."""
        major = _extract_rhel_version("el10")
        assert major == "10"

    def test_none_returns_none(self):
        assert _extract_rhel_version(None) is None

    def test_empty_string_returns_none(self):
        assert _extract_rhel_version("") is None

    def test_invalid_format_returns_none(self):
        """Only the "el<N>" spelling is accepted."""
        for tag in ("rhel7", "centos7"):
            assert _extract_rhel_version(tag) is None


class TestInterpretFixState:
    """Tests for _interpret_fix_state helper."""

    def test_not_affected_returns_no(self):
        for state in ("Not affected", "not affected", "NOT AFFECTED"):
            assert _interpret_fix_state(state) == EnumIdentifyResult.NO

    def test_will_not_fix_returns_no(self):
        for state in ("Will not fix", "will not fix"):
            assert _interpret_fix_state(state) == EnumIdentifyResult.NO

    def test_out_of_support_scope_returns_no(self):
        for state in ("Out of support scope", "out of support scope"):
            assert _interpret_fix_state(state) == EnumIdentifyResult.NO

    def test_affected_returns_yes(self):
        for state in ("Affected", "affected"):
            assert _interpret_fix_state(state) == EnumIdentifyResult.YES

    def test_fix_deferred_returns_yes(self):
        for state in ("Fix deferred", "fix deferred"):
            assert _interpret_fix_state(state) == EnumIdentifyResult.YES

    def test_under_investigation_returns_yes(self):
        verdict = _interpret_fix_state("Under investigation")
        assert verdict == EnumIdentifyResult.YES

    def test_none_returns_none(self):
        assert _interpret_fix_state(None) is None

    def test_empty_returns_none(self):
        assert _interpret_fix_state("") is None

    def test_unknown_state_returns_none(self):
        """States outside the known vocabulary give no verdict."""
        for state in ("Fixed", "Unknown", "Pending"):
            assert _interpret_fix_state(state) is None
class TestMatchPackageStateForDistro:
    """Tests for _match_package_state_for_distro helper."""

    @pytest.fixture
    def rhel6_package_state(self):
        """libarchive on RHEL 6: Not affected."""
        state = CveIntelRhsa.PackageState(
            product_name="Red Hat Enterprise Linux 6",
            fix_state="Not affected",
            package_name="libarchive",
            cpe="cpe:/o:redhat:enterprise_linux:6",
        )
        return state

    @pytest.fixture
    def rhel7_package_state(self):
        """libarchive on RHEL 7: Will not fix."""
        state = CveIntelRhsa.PackageState(
            product_name="Red Hat Enterprise Linux 7",
            fix_state="Will not fix",
            package_name="libarchive",
            cpe="cpe:/o:redhat:enterprise_linux:7",
        )
        return state

    @pytest.fixture
    def rhel8_package_state(self):
        """libarchive on RHEL 8: Affected."""
        state = CveIntelRhsa.PackageState(
            product_name="Red Hat Enterprise Linux 8",
            fix_state="Affected",
            package_name="libarchive",
            cpe="cpe:/o:redhat:enterprise_linux:8",
        )
        return state

    def test_matches_by_cpe(self, rhel6_package_state, rhel7_package_state):
        """el7 selects the RHEL 7 entry out of several."""
        matched = _match_package_state_for_distro(
            [rhel6_package_state, rhel7_package_state], "libarchive", "el7"
        )
        assert matched is not None
        assert matched.fix_state == "Will not fix"

    def test_matches_el6(self, rhel6_package_state, rhel7_package_state):
        """el6 selects the RHEL 6 entry out of several."""
        matched = _match_package_state_for_distro(
            [rhel6_package_state, rhel7_package_state], "libarchive", "el6"
        )
        assert matched is not None
        assert matched.fix_state == "Not affected"

    def test_no_distro_returns_first_name_match(self, rhel6_package_state, rhel7_package_state):
        """Without a distro tag the first entry with the package name wins."""
        matched = _match_package_state_for_distro(
            [rhel6_package_state, rhel7_package_state], "libarchive", None
        )
        assert matched is not None
        assert matched == rhel6_package_state

    def test_no_package_match_returns_none(self, rhel7_package_state):
        """A different package name never matches."""
        matched = _match_package_state_for_distro([rhel7_package_state], "curl", "el7")
        assert matched is None

    def test_no_distro_match_returns_none(self, rhel7_package_state):
        """Right package but wrong RHEL major does not match."""
        matched = _match_package_state_for_distro([rhel7_package_state], "libarchive", "el9")
        assert matched is None

    def test_matches_by_product_name(self):
        """With no CPE on the entry, the product name is used for matching."""
        cpe_less_state = CveIntelRhsa.PackageState(
            product_name="Red Hat Enterprise Linux 7",
            fix_state="Will not fix",
            package_name="libarchive",
            cpe=None,  # No CPE, should match by product_name
        )

        matched = _match_package_state_for_distro([cpe_less_state], "libarchive", "el7")
        assert matched is not None
        assert matched.fix_state == "Will not fix"
class TestPackageIdentifierWithFixState:
    """Integration tests for PackageIdentifier with RHSA fix_state."""

    @staticmethod
    def _cve_2016_8687_intel():
        """Intel for CVE-2016-8687: NVD pins 3.2.1; RHSA covers RHEL 6 and 7."""
        return CveIntel(
            vuln_id="CVE-2016-8687",
            nvd=CveIntelNvd(
                cve_id="CVE-2016-8687",
                configurations=[
                    CveIntelNvd.Configuration(
                        package="libarchive",
                        vendor="libarchive",
                        versionStartIncluding="3.2.1",
                        versionEndIncluding="3.2.1",
                    ),
                ],
            ),
            rhsa=CveIntelRhsa(
                package_state=[
                    CveIntelRhsa.PackageState(
                        product_name="Red Hat Enterprise Linux 6",
                        fix_state="Not affected",
                        package_name="libarchive",
                        cpe="cpe:/o:redhat:enterprise_linux:6",
                    ),
                    CveIntelRhsa.PackageState(
                        product_name="Red Hat Enterprise Linux 7",
                        fix_state="Will not fix",
                        package_name="libarchive",
                        cpe="cpe:/o:redhat:enterprise_linux:7",
                    ),
                ],
            ),
        )

    @staticmethod
    def _somelib_intel(**version_end):
        """Intel for somelib on RHEL 8 with no RHSA fix_state.

        ``version_end`` supplies the upper NVD bound keyword
        (``versionEndIncluding`` or ``versionEndExcluding``).
        """
        return CveIntel(
            vuln_id="CVE-2024-0001",
            nvd=CveIntelNvd(
                cve_id="CVE-2024-0001",
                configurations=[
                    CveIntelNvd.Configuration(
                        package="somelib",
                        vendor="somevendor",
                        versionStartIncluding="1.0.0",
                        **version_end,
                    ),
                ],
            ),
            rhsa=CveIntelRhsa(
                package_state=[
                    CveIntelRhsa.PackageState(
                        product_name="Red Hat Enterprise Linux 8",
                        fix_state=None,  # No fix_state, should fall back to NVD
                        package_name="somelib",
                        cpe="cpe:/o:redhat:enterprise_linux:8",
                    ),
                ],
            ),
        )

    def test_cve_2016_8687_rhel7_will_not_fix(self):
        """On RHEL 7 the RHSA "Will not fix" state yields NOT_VUL."""
        el7_libarchive = TargetPackage(
            name="libarchive",
            version="3.1.2",
            release="14.el7_9.1",
            arch="x86_64",
        )

        status, result = PackageIdentifier(el7_libarchive).identify(self._cve_2016_8687_intel())

        assert result.is_target_package_affected == EnumIdentifyResult.NO
        assert status == PackageCheckerStatus.PKG_IDENT_NOT_VUL
        assert "Will not fix" in result.conclusion_reason

    def test_cve_2016_8687_rhel6_not_affected(self):
        """CVE-2016-8687 on RHEL 6 should be NO (Not affected)."""
        el6_libarchive = TargetPackage(
            name="libarchive",
            version="3.0.3",
            release="6.el6_10",
            arch="x86_64",
        )

        status, result = PackageIdentifier(el6_libarchive).identify(self._cve_2016_8687_intel())

        assert result.is_target_package_affected == EnumIdentifyResult.NO
        assert status == PackageCheckerStatus.PKG_IDENT_NOT_VUL
        # The explanation must carry the RHSA details.
        reason = result.conclusion_reason
        assert "RHSA fix_state" in reason
        assert "Not affected" in reason
        assert "libarchive" in reason
        assert "el6" in reason

    def test_falls_back_to_nvd_when_no_fix_state(self):
        """When RHSA has no fix_state, should fall back to NVD version check."""
        in_range_target = TargetPackage(
            name="somelib",
            version="1.0.0",
            release="1.el8",
            arch="x86_64",
        )
        intel = self._somelib_intel(versionEndIncluding="1.5.0")

        status, result = PackageIdentifier(in_range_target).identify(intel)

        # 1.0.0 lies inside the NVD range [1.0.0, 1.5.0], so the answer is YES.
        assert result.is_target_package_affected == EnumIdentifyResult.YES
        assert status == PackageCheckerStatus.OK

    def test_nvd_version_outside_range_populates_conclusion_reason(self):
        """When target version is outside NVD range, conclusion_reason should explain."""
        out_of_range_target = TargetPackage(
            name="somelib",
            version="2.0.0",  # Outside the affected range
            release="1.el8",
            arch="x86_64",
        )
        intel = self._somelib_intel(versionEndExcluding="1.5.0")

        status, result = PackageIdentifier(out_of_range_target).identify(intel)

        # 2.0.0 lies outside the NVD range [1.0.0, 1.5.0), so the answer is NO.
        assert result.is_target_package_affected == EnumIdentifyResult.NO
        assert status == PackageCheckerStatus.PKG_IDENT_NOT_VUL
        # The explanation must carry the NVD version-range details.
        reason = result.conclusion_reason
        assert "NVD" in reason
        assert "version" in reason.lower()
        assert "somelib" in reason
        assert "2.0.0" in reason
        assert ">=1.0.0" in reason
        assert "<1.5.0" in reason
class TestPackageIdentifierRhsaScope:
    """Step 1: RHSA package_state + affected_release scope gate."""

    @staticmethod
    def _curl_el9_target():
        """The curl 7.76.1-26.el9 x86_64 target shared by both tests."""
        return TargetPackage(
            name="curl",
            version="7.76.1",
            release="26.el9",
            arch="x86_64",
        )

    def test_affected_release_only_passes_scope(self):
        """A package listed only under affected_release is still in scope."""
        rhsa = CveIntelRhsa(
            package_state=[],
            affected_release=[
                {"package": "curl-7.76.1-23.el9", "product_name": "Red Hat Enterprise Linux 9"},
            ],
        )
        intel = CveIntel(vuln_id="CVE-2024-TEST", rhsa=rhsa)

        status, result = PackageIdentifier(self._curl_el9_target()).identify(intel)

        assert status == PackageCheckerStatus.OK
        assert result.fixed_rpm_list

    def test_not_in_either_rhsa_bucket_is_cve_mismatch(self):
        """A package absent from both RHSA lists is reported as a CVE mismatch."""
        rhsa = CveIntelRhsa(
            package_state=[
                CveIntelRhsa.PackageState(
                    package_name="webkit2gtk3",
                    fix_state="Affected",
                    product_name="Red Hat Enterprise Linux 9",
                ),
            ],
            affected_release=[
                {"package": "webkit2gtk3-2.42.5-1.el9"},
            ],
        )
        intel = CveIntel(vuln_id="CVE-2024-TEST", rhsa=rhsa)

        status, result = PackageIdentifier(self._curl_el9_target()).identify(intel)

        assert status == PackageCheckerStatus.PKG_IDENT_CVE_MISMATCH
        assert result.is_target_package_affected == EnumIdentifyResult.NO