diff --git a/.agents/skills/code-change-verification/SKILL.md b/.agents/skills/code-change-verification/SKILL.md deleted file mode 100644 index 1125839209..0000000000 --- a/.agents/skills/code-change-verification/SKILL.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -name: code-change-verification -description: Run the mandatory verification stack when changes affect runtime code, tests, or build/test behavior in the OpenAI Agents Python repository. ---- - -# Code Change Verification - -## Overview - -Ensure work is only marked complete after formatting, linting, type checking, and tests pass. Use this skill when changes affect runtime code, tests, or build/test configuration. You can skip it for docs-only or repository metadata unless a user asks for the full stack. - -## Quick start - -1. Keep this skill at `./.agents/skills/code-change-verification` so it loads automatically for the repository. -2. macOS/Linux: `bash .agents/skills/code-change-verification/scripts/run.sh`. -3. Windows: `powershell -ExecutionPolicy Bypass -File .agents/skills/code-change-verification/scripts/run.ps1`. -4. If any command fails, fix the issue, rerun the script, and report the failing output. -5. Confirm completion only when all commands succeed with no remaining issues. - -## Manual workflow - -- If dependencies are not installed or have changed, run `make sync` first to install dev requirements via `uv`. -- Run from the repository root in this order: `make format`, `make lint`, `make typecheck`, `make tests`. -- Do not skip steps; stop and fix issues immediately when a command fails. -- Re-run the full stack after applying fixes so the commands execute in the required order. - -## Resources - -### scripts/run.sh - -- Executes the full verification sequence with fail-fast semantics from the repository root. Prefer this entry point to ensure the required commands run in the correct order. 
- -### scripts/run.ps1 - -- Windows-friendly wrapper that runs the same verification sequence with fail-fast semantics. Use from PowerShell with execution policy bypass if required by your environment. diff --git a/.agents/skills/code-change-verification/agents/openai.yaml b/.agents/skills/code-change-verification/agents/openai.yaml deleted file mode 100644 index 8ebf11e246..0000000000 --- a/.agents/skills/code-change-verification/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Code Change Verification" - short_description: "Run the required local verification stack" - default_prompt: "Use $code-change-verification to run the required local verification stack and report any failures." diff --git a/.agents/skills/code-change-verification/scripts/run.ps1 b/.agents/skills/code-change-verification/scripts/run.ps1 deleted file mode 100644 index c3b6c5e2c2..0000000000 --- a/.agents/skills/code-change-verification/scripts/run.ps1 +++ /dev/null @@ -1,38 +0,0 @@ -Set-StrictMode -Version Latest -$ErrorActionPreference = "Stop" - -$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Definition -$repoRoot = $null - -try { - $repoRoot = (& git -C $scriptDir rev-parse --show-toplevel 2>$null) -} catch { - $repoRoot = $null -} - -if (-not $repoRoot) { - $repoRoot = Resolve-Path (Join-Path $scriptDir "..\\..\\..\\..") -} - -Set-Location $repoRoot - -function Invoke-MakeStep { - param( - [Parameter(Mandatory = $true)][string]$Step - ) - - Write-Host "Running make $Step..." - & make $Step - - if ($LASTEXITCODE -ne 0) { - Write-Error "code-change-verification: make $Step failed with exit code $LASTEXITCODE." - exit $LASTEXITCODE - } -} - -Invoke-MakeStep -Step "format" -Invoke-MakeStep -Step "lint" -Invoke-MakeStep -Step "typecheck" -Invoke-MakeStep -Step "tests" - -Write-Host "code-change-verification: all commands passed." 
diff --git a/.agents/skills/code-change-verification/scripts/run.sh b/.agents/skills/code-change-verification/scripts/run.sh deleted file mode 100755 index d92505fe8b..0000000000 --- a/.agents/skills/code-change-verification/scripts/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -# Fail fast on any error or undefined variable. -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if command -v git >/dev/null 2>&1; then - REPO_ROOT="$(git -C "${SCRIPT_DIR}" rev-parse --show-toplevel 2>/dev/null || true)" -fi -REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../../.." && pwd)}" - -cd "${REPO_ROOT}" - -echo "Running make format..." -make format - -echo "Running make lint..." -make lint - -echo "Running make typecheck..." -make typecheck - -echo "Running make tests..." -make tests - -echo "code-change-verification: all commands passed." diff --git a/.agents/skills/docs-sync/SKILL.md b/.agents/skills/docs-sync/SKILL.md deleted file mode 100644 index 32b3bb46da..0000000000 --- a/.agents/skills/docs-sync/SKILL.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -name: docs-sync -description: Analyze main branch implementation and configuration to find missing, incorrect, or outdated documentation in docs/. Use when asked to audit doc coverage, sync docs with code, or propose doc updates/structure changes. Only update English docs under docs/** and never touch translated docs under docs/ja, docs/ko, or docs/zh. Provide a report and ask for approval before editing docs. ---- - -# Docs Sync - -## Overview - -Identify doc coverage gaps and inaccuracies by comparing main branch features and configuration options against the current docs structure, then propose targeted improvements. - -## Workflow - -1. Confirm scope and base branch - - Identify the current branch and default branch (usually `main`). - - Prefer analyzing the current branch to keep work aligned with in-flight changes. 
- - If the current branch is not `main`, analyze only the diff vs `main` to scope doc updates. - - Avoid switching branches if it would disrupt local changes; use `git show main:` or `git worktree add` when needed. - -2. Build a feature inventory from the selected scope - - If on `main`: inventory the full surface area and review docs comprehensively. - - If not on `main`: inventory only changes vs `main` (feature additions/changes/removals). - - Focus on user-facing behavior: public exports, configuration options, environment variables, CLI commands, default values, and documented runtime behaviors. - - Capture evidence for each item (file path + symbol/setting). - - Use targeted search to find option types and feature flags (for example: `rg "Settings"`, `rg "Config"`, `rg "os.environ"`, `rg "OPENAI_"`). - - When the topic involves OpenAI platform features, invoke `$openai-knowledge` to pull current details from the OpenAI Developer Docs MCP server instead of guessing, while treating the SDK source code as the source of truth when discrepancies appear. - -3. Doc-first pass: review existing pages - - Walk each relevant page under `docs/` (excluding `docs/ja`, `docs/ko`, and `docs/zh`). - - Identify missing mentions of important, supported options (opt-in flags, env vars), customization points, or new features from `src/agents/` and `examples/`. - - Propose additions where users would reasonably expect to find them on that page. - -4. Code-first pass: map features to docs - - Review the current docs information architecture under `docs/` and `mkdocs.yml`. - - Determine the best page/section for each feature based on existing patterns and the API reference structure under `docs/ref`. - - Identify features that lack any doc page or have a page but no corresponding content. - - Note when a structural adjustment would improve discoverability. - - When improving `docs/ref/*` pages, treat the corresponding docstrings/comments in `src/` as the source of truth. 
Prefer updating those code comments so regenerated reference docs stay correct, instead of hand-editing the generated pages. - -5. Detect gaps and inaccuracies - - **Missing**: features/configs present in main but absent in docs. - - **Incorrect/outdated**: names, defaults, or behaviors that diverge from main. - - **Structural issues** (optional): pages overloaded, missing overviews, or mis-grouped topics. - -6. Produce a Docs Sync Report and ask for approval - - Provide a clear report with evidence, suggested doc locations, and proposed edits. - - Ask the user whether to proceed with doc updates. - -7. If approved, apply changes (English only) - - Edit only English docs in `docs/**`. - - Do **not** edit `docs/ja`, `docs/ko`, or `docs/zh`. - - Keep changes aligned with the existing docs style and navigation. - - Update `mkdocs.yml` when adding or renaming pages. - - Build docs with `make build-docs` after edits to verify the docs site still builds. - -## Output format - -Use this template when reporting findings: - -Docs Sync Report - -- Doc-first findings - - Page + missing content -> evidence + suggested insertion point -- Code-first gaps - - Feature + evidence -> suggested doc page/section (or missing page) -- Incorrect or outdated docs - - Doc file + issue + correct info + evidence -- Structural suggestions (optional) - - Proposed change + rationale -- Proposed edits - - Doc file -> concise change summary -- Questions for the user - -## References - -- `references/doc-coverage-checklist.md` diff --git a/.agents/skills/docs-sync/agents/openai.yaml b/.agents/skills/docs-sync/agents/openai.yaml deleted file mode 100644 index 145f6d99a5..0000000000 --- a/.agents/skills/docs-sync/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Docs Sync" - short_description: "Audit docs coverage and propose targeted updates" - default_prompt: "Use $docs-sync to audit the current branch against docs/ and propose targeted documentation updates." 
diff --git a/.agents/skills/docs-sync/references/doc-coverage-checklist.md b/.agents/skills/docs-sync/references/doc-coverage-checklist.md deleted file mode 100644 index 01d144c170..0000000000 --- a/.agents/skills/docs-sync/references/doc-coverage-checklist.md +++ /dev/null @@ -1,56 +0,0 @@ -# Doc Coverage Checklist - -Use this checklist to scan the selected scope (main = comprehensive, or current-branch diff) and validate documentation coverage. - -## Feature inventory targets - -- Public exports: classes, functions, types, and module entry points. -- Configuration options: `*Settings` types, default config objects, and builder patterns. -- Environment variables or runtime flags. -- CLI commands, scripts, and example entry points that define supported usage. -- User-facing behaviors: retry, timeouts, streaming, errors, logging, telemetry, and data handling. -- Deprecations, removals, or renamed settings. - -## Doc-first pass (page-by-page) - -- Review each relevant English page (excluding `docs/ja`, `docs/ko`, and `docs/zh`). -- Look for missing opt-in flags, env vars, or customization options that the page implies. -- Add new features that belong on that page based on user intent and navigation. - -## Code-first pass (feature inventory) - -- Map features to the closest existing page based on the docs navigation in `mkdocs.yml`. -- Prefer updating existing pages over creating new ones unless the topic is clearly new. -- Use conceptual pages for cross-cutting concerns (auth, errors, streaming, tracing, tools). -- Keep quick-start flows minimal; move advanced details into deeper pages. - -## Evidence capture - -- Record the main-branch file path and symbol/setting name. -- Note defaults or behavior-critical details for accuracy checks. -- Avoid large code dumps; a short identifier is enough. - -## Red flags for outdated or incorrect docs - -- Option names/types no longer exist or differ from code. -- Default values or allowed ranges do not match implementation. 
-- Features removed in code but still documented. -- New behaviors introduced without corresponding docs updates. - -## When to propose structural changes - -- A page mixes unrelated audiences (quick-start + deep reference) without clear separation. -- Multiple pages duplicate the same concept without cross-links. -- New feature areas have no obvious home in the nav structure. - -## Diff mode guidance (current branch vs main) - -- Focus only on changed behavior: new exports/options, modified defaults, removed features, or renamed settings. -- Use `git diff main...HEAD` (or equivalent) to constrain analysis. -- Document removals explicitly so docs can be pruned if needed. - -## Patch guidance - -- Keep edits scoped and aligned with existing tone and format. -- Update cross-links when moving or renaming sections. -- Leave translated docs untouched; English-only updates. diff --git a/.agents/skills/examples-auto-run/SKILL.md b/.agents/skills/examples-auto-run/SKILL.md deleted file mode 100644 index 4ecff71c9c..0000000000 --- a/.agents/skills/examples-auto-run/SKILL.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -name: examples-auto-run -description: Run python examples in auto mode with logging, rerun helpers, and background control. ---- - -# examples-auto-run - -## What it does - -- Runs `uv run examples/run_examples.py` with: - - `EXAMPLES_INTERACTIVE_MODE=auto` (auto-input/auto-approve). - - Per-example logs under `.tmp/examples-start-logs/`. - - Main summary log path passed via `--main-log` (also under `.tmp/examples-start-logs/`). - - Generates a rerun list of failures at `.tmp/examples-rerun.txt` when `--write-rerun` is set. -- Provides start/stop/status/logs/tail/collect/rerun helpers via `run.sh`. -- Background option keeps the process running with a pidfile; `stop` cleans it up. 
- -## Usage - -```bash -# Start (auto mode; interactive included by default) -.agents/skills/examples-auto-run/scripts/run.sh start [extra args to run_examples.py] -# Examples: -.agents/skills/examples-auto-run/scripts/run.sh start --filter basic -.agents/skills/examples-auto-run/scripts/run.sh start --include-server --include-audio - -# Check status -.agents/skills/examples-auto-run/scripts/run.sh status - -# Stop running job -.agents/skills/examples-auto-run/scripts/run.sh stop - -# List logs -.agents/skills/examples-auto-run/scripts/run.sh logs - -# Tail latest log (or specify one) -.agents/skills/examples-auto-run/scripts/run.sh tail -.agents/skills/examples-auto-run/scripts/run.sh tail main_20260113-123000.log - -# Collect rerun list from a main log (defaults to latest main_*.log) -.agents/skills/examples-auto-run/scripts/run.sh collect - -# Rerun only failed entries from rerun file (auto mode) -.agents/skills/examples-auto-run/scripts/run.sh rerun -``` - -## Defaults (overridable via env) - -- `EXAMPLES_INTERACTIVE_MODE=auto` -- `EXAMPLES_INCLUDE_INTERACTIVE=1` -- `EXAMPLES_INCLUDE_SERVER=0` -- `EXAMPLES_INCLUDE_AUDIO=0` -- `EXAMPLES_INCLUDE_EXTERNAL=0` -- Auto-approvals in auto mode: `APPLY_PATCH_AUTO_APPROVE=1`, `SHELL_AUTO_APPROVE=1`, `AUTO_APPROVE_MCP=1` - -## Log locations - -- Main logs: `.tmp/examples-start-logs/main_*.log` -- Per-example logs (from `run_examples.py`): `.tmp/examples-start-logs/.log` -- Rerun list: `.tmp/examples-rerun.txt` -- Stdout logs: `.tmp/examples-start-logs/stdout_*.log` - -## Notes - -- The runner delegates to `uv run examples/run_examples.py`, which already writes per-example logs and supports `--collect`, `--rerun-file`, and `--print-auto-skip`. -- `start` uses `--write-rerun` so failures are captured automatically. -- If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default. 
- -## Behavioral validation (Codex/LLM responsibility) - -The runner does not perform any automated behavioral validation. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: - -1. Read the example source (and comments) to infer intended flow, tools used, and expected key outputs. -2. Open the matching per-example log under `.tmp/examples-start-logs/`. -3. Confirm the intended actions/results occurred; flag omissions or divergences. -4. Do this for **all passed examples**, not just a sample. -5. Report immediately after the run with concise citations to the exact log lines that justify the validation. diff --git a/.agents/skills/examples-auto-run/agents/openai.yaml b/.agents/skills/examples-auto-run/agents/openai.yaml deleted file mode 100644 index bb9b66c695..0000000000 --- a/.agents/skills/examples-auto-run/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Examples Auto Run" - short_description: "Run examples in auto mode with logs and rerun helpers" - default_prompt: "Use $examples-auto-run to run the repo examples in auto mode, collect logs, and summarize any failures." diff --git a/.agents/skills/examples-auto-run/scripts/run.sh b/.agents/skills/examples-auto-run/scripts/run.sh deleted file mode 100755 index 5421a500cb..0000000000 --- a/.agents/skills/examples-auto-run/scripts/run.sh +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." 
&& pwd)" -PID_FILE="$ROOT/.tmp/examples-auto-run.pid" -LOG_DIR="$ROOT/.tmp/examples-start-logs" -RERUN_FILE="$ROOT/.tmp/examples-rerun.txt" - -ensure_dirs() { - mkdir -p "$LOG_DIR" "$ROOT/.tmp" -} - -is_running() { - local pid="$1" - [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1 -} - -cmd_start() { - ensure_dirs - local background=0 - if [[ "${1:-}" == "--background" ]]; then - background=1 - shift - fi - - local ts main_log stdout_log - ts="$(date +%Y%m%d-%H%M%S)" - main_log="$LOG_DIR/main_${ts}.log" - stdout_log="$LOG_DIR/stdout_${ts}.log" - - local run_cmd=( - uv run examples/run_examples.py - --auto-mode - --write-rerun - --main-log "$main_log" - --logs-dir "$LOG_DIR" - ) - - if [[ "$background" -eq 1 ]]; then - if [[ -f "$PID_FILE" ]]; then - local pid - pid="$(cat "$PID_FILE" 2>/dev/null || true)" - if is_running "$pid"; then - echo "examples/run_examples.py already running (pid=$pid)." - exit 1 - fi - fi - ( - trap '' HUP - export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" - export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" - export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" - export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" - export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" - export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" - export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" - export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" - cd "$ROOT" - exec "${run_cmd[@]}" "$@" > >(tee "$stdout_log") 2>&1 - ) & - local pid=$! - echo "$pid" >"$PID_FILE" - echo "Started run_examples.py (pid=$pid)" - echo "Main log: $main_log" - echo "Stdout log: $stdout_log" - echo "Run '.agents/skills/examples-auto-run/scripts/run.sh validate \"$main_log\"' after it finishes." 
- return 0 - fi - - export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" - export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" - export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" - export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" - export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" - export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" - export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" - export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" - cd "$ROOT" - set +e - "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" - local run_status=${PIPESTATUS[0]} - set -e - return "$run_status" -} - -cmd_stop() { - if [[ ! -f "$PID_FILE" ]]; then - echo "No pid file; nothing to stop." - return 0 - fi - local pid - pid="$(cat "$PID_FILE" 2>/dev/null || true)" - if [[ -z "$pid" ]]; then - rm -f "$PID_FILE" - echo "Pid file empty; cleaned." - return 0 - fi - if ! is_running "$pid"; then - rm -f "$PID_FILE" - echo "Process $pid not running; cleaned pid file." - return 0 - fi - echo "Stopping pid $pid ..." - kill "$pid" 2>/dev/null || true - sleep 1 - if is_running "$pid"; then - echo "Sending SIGKILL to $pid ..." - kill -9 "$pid" 2>/dev/null || true - fi - rm -f "$PID_FILE" - echo "Stopped." -} - -cmd_status() { - if [[ -f "$PID_FILE" ]]; then - local pid - pid="$(cat "$PID_FILE" 2>/dev/null || true)" - if is_running "$pid"; then - echo "Running (pid=$pid)" - return 0 - fi - fi - echo "Not running." -} - -cmd_logs() { - ensure_dirs - ls -1t "$LOG_DIR" -} - -cmd_tail() { - ensure_dirs - local file="${1:-}" - if [[ -z "$file" ]]; then - file="$(ls -1t "$LOG_DIR" | head -n1)" - fi - if [[ -z "$file" ]]; then - echo "No log files yet." - exit 1 - fi - tail -f "$LOG_DIR/$file" -} - -collect_rerun() { - ensure_dirs - local log_file="${1:-}" - if [[ -z "$log_file" ]]; then - log_file="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" - fi - if [[ -z "$log_file" ]] || [[ ! 
-f "$log_file" ]]; then - echo "No main log file found." - exit 1 - fi - cd "$ROOT" - uv run examples/run_examples.py --collect "$log_file" --output "$RERUN_FILE" -} - -cmd_rerun() { - ensure_dirs - local file="${1:-$RERUN_FILE}" - if [[ ! -s "$file" ]]; then - echo "Rerun list is empty: $file" - exit 0 - fi - local ts main_log stdout_log - ts="$(date +%Y%m%d-%H%M%S)" - main_log="$LOG_DIR/main_${ts}.log" - stdout_log="$LOG_DIR/stdout_${ts}.log" - cd "$ROOT" - export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" - export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" - export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" - export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" - set +e - uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun --main-log "$main_log" --logs-dir "$LOG_DIR" 2>&1 | tee "$stdout_log" - local run_status=${PIPESTATUS[0]} - set -e - return "$run_status" -} - -usage() { - cat <<'EOF' -Usage: run.sh [args...] - -Commands: - start [--filter ... | other args] Run examples in auto mode (foreground). Pass --background to run detached. - stop Kill the running auto-run (if any). - status Show whether it is running. - logs List log files (.tmp/examples-start-logs). - tail [logfile] Tail the latest (or specified) log. - collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. - rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. 
- -Environment overrides: - EXAMPLES_INTERACTIVE_MODE (default auto) - EXAMPLES_INCLUDE_SERVER/INTERACTIVE/AUDIO/EXTERNAL (defaults: 0/1/0/0) - APPLY_PATCH_AUTO_APPROVE, SHELL_AUTO_APPROVE, AUTO_APPROVE_MCP (default 1 in auto mode) -EOF -} - -default_cmd="start" -if [[ $# -eq 0 && -s "$RERUN_FILE" ]]; then - default_cmd="rerun" -fi - -case "${1:-$default_cmd}" in - start) shift || true; cmd_start "$@" ;; - stop) shift || true; cmd_stop ;; - status) shift || true; cmd_status ;; - logs) shift || true; cmd_logs ;; - tail) shift; cmd_tail "${1:-}" ;; - collect) shift || true; collect_rerun "${1:-}" ;; - rerun) shift || true; cmd_rerun "${1:-}" ;; - *) usage; exit 1 ;; -esac diff --git a/.agents/skills/final-release-review/SKILL.md b/.agents/skills/final-release-review/SKILL.md deleted file mode 100644 index bf2fa40bd6..0000000000 --- a/.agents/skills/final-release-review/SKILL.md +++ /dev/null @@ -1,126 +0,0 @@ ---- -name: final-release-review -description: Perform a release-readiness review by locating the previous release tag from remote tags and auditing the diff (e.g., v1.2.3...) for breaking changes, regressions, improvement opportunities, and risks before releasing openai-agents-python. ---- - -# Final Release Review - -## Purpose - -Use this skill when validating the latest release candidate commit (default tip of `origin/main`) for release. It guides you to fetch remote tags, pick the previous release tag, and thoroughly inspect the `BASE_TAG...TARGET` diff for breaking changes, introduced bugs/regressions, improvement opportunities, and release risks. - -The review must be stable and actionable: avoid variance between runs by using explicit gate rules, and never produce a `BLOCKED` call without concrete evidence and clear unblock actions. - -## Quick start - -1. Ensure repository root: `pwd` → `path-to-workspace/openai-agents-python`. -2.
Sync tags and pick base (default `v*`): - ```bash - BASE_TAG="$(.agents/skills/final-release-review/scripts/find_latest_release_tag.sh origin 'v*')" - ``` -3. Choose target commit (default tip of `origin/main`, ensure fresh): `git fetch origin main --prune` then `TARGET="$(git rev-parse origin/main)"`. -4. Snapshot scope: - ```bash - git diff --stat "${BASE_TAG}"..."${TARGET}" - git diff --dirstat=files,0 "${BASE_TAG}"..."${TARGET}" - git log --oneline --reverse "${BASE_TAG}".."${TARGET}" - git diff --name-status "${BASE_TAG}"..."${TARGET}" - ``` -5. Deep review using `references/review-checklist.md` to spot breaking changes, regressions, and improvement chances. -6. Capture findings and call the release gate: ship/block with conditions; propose focused tests for risky areas. - -## Deterministic gate policy - -- Default to **🟢 GREEN LIGHT TO SHIP** unless at least one blocking trigger below is satisfied. -- Use **🔴 BLOCKED** only when you can cite concrete release-blocking evidence and provide actionable unblock steps. -- Blocking triggers (at least one required for `BLOCKED`): - - A confirmed regression or bug introduced in `BASE...TARGET` (for example, failing targeted test, incompatible behavior in diff, or removed behavior without fallback). - - A confirmed breaking public API/protocol/config change with missing or mismatched versioning and no migration path (for example, patch release for a breaking change). - - A concrete data-loss, corruption, or security-impacting change with unresolved mitigation. - - A release-critical packaging/build/runtime path is broken by the diff (not speculative). -- Non-blocking by itself: - - Large diff size, broad refactor, or many touched files. - - "Could regress" risk statements without concrete evidence. - - Not running tests locally. -- If evidence is incomplete, issue **🟢 GREEN LIGHT TO SHIP** with targeted validation follow-ups instead of `BLOCKED`.
- -## Workflow - -- **Prepare** - - Run the quick-start tag command to ensure you use the latest remote tag. If the tag pattern differs, override the pattern argument (e.g., `'*.*.*'`). - - If the user specifies a base tag, prefer it but still fetch remote tags first. - - Keep the working tree clean to avoid diff noise. -- **Assumptions** - - Assume the target commit (default `origin/main` tip) has already passed `$code-change-verification` in CI unless the user says otherwise. - - Do not block a release solely because you did not run tests locally; focus on concrete behavioral or API risks. - - Release policy: routine releases use patch versions; use minor only for breaking changes or major feature additions. Major versions are reserved until the 1.0 release. -- **Map the diff** - - Use `--stat`, `--dirstat`, and `--name-status` outputs to spot hot directories and file types. - - For suspicious files, prefer `git diff --word-diff BASE...TARGET -- `. - - Note any deleted or newly added tests, config, migrations, or scripts. -- **Analyze risk** - - Walk through the categories in `references/review-checklist.md` (breaking changes, regression clues, improvement opportunities). - - When you suspect a risk, cite the specific file/commit and explain the behavioral impact. - - For every finding, include all of: `Evidence`, `Impact`, and `Action`. - - Severity calibration: - - **🟢 LOW**: low blast radius or clearly covered behavior; no release gate impact. - - **🟡 MODERATE**: plausible user-facing regression signal; needs validation but not a confirmed blocker. - - **🔴 HIGH**: confirmed or strongly evidenced release-blocking issue. - - Suggest minimal, high-signal validation commands (targeted tests or linters) instead of generic reruns when time is tight.
- - Breaking changes do not automatically require a BLOCKED release call when they are already covered by an appropriate version bump and migration/upgrade notes; only block when the bump is missing/mismatched (e.g., patch bump) or when the breaking change introduces unresolved risk. -- **Form a recommendation** - - State BASE_TAG and TARGET explicitly. - - Provide a concise diff summary (key directories/files and counts). - - List: breaking-change candidates, probable regressions/bugs, improvement opportunities, missing release notes/migrations. - - Recommend ship/block and the exact checks needed to unblock if blocking. If a breaking change is properly versioned (minor/major), you may still recommend a GREEN LIGHT TO SHIP while calling out the change. Use emoji and boldface in the release call to make the gate obvious. - - If you cannot provide a concrete unblock checklist item, do not use `BLOCKED`. - -## Output format (required) - -All output must be in English. - -Use the following report structure in every response produced by this skill. Be proactive and decisive: make a clear ship/block call near the top, and assign an explicit risk level (LOW/MODERATE/HIGH) to each finding with a short impact statement. Avoid overly cautious hedging when the risk is low and tests passed. - -Always use the fixed repository URL in the Diff section (`https://github.com/openai/openai-agents-python/compare/...`). Do not use `${GITHUB_REPOSITORY}` or any other template variable. Format risk levels as bold emoji labels: **🟢 LOW**, **🟡 MODERATE**, **🔴 HIGH**. - -Every risk finding must contain an actionable next step. If the report uses `**🔴 BLOCKED**`, include an `Unblock checklist` section with at least one concrete command/task and a pass condition. - -``` -### Release readiness review ( -> TARGET ) - -This is a release readiness report done by `$final-release-review` skill. - -### Diff - -https://github.com/openai/openai-agents-python/compare/...
- -### Release call: -**<🟢 GREEN LIGHT TO SHIP | 🔴 BLOCKED>** - -### Scope summary: -- - -### Risk assessment (ordered by impact): -1) **** - - Risk: **<🟢 LOW | 🟡 MODERATE | 🔴 HIGH>**. - - Evidence: - - Files: - - Action: -2) ... - -### Unblock checklist (required when Release call is BLOCKED): -1. [ ] - - Exit criteria: -2. ... - -### Notes: -- -``` - -If no risks are found, include a “No material risks identified” line under Risk assessment and still provide a ship call. If you did not run local verification, do not add a verification status section or use it as a release blocker; note any assumptions briefly in Notes. -If the report is not blocked, omit the `Unblock checklist` section. - -### Resources - -- `scripts/find_latest_release_tag.sh`: Fetches remote tags and returns the newest tag matching a pattern (default `v*`). -- `references/review-checklist.md`: Detailed signals and commands for spotting breaking changes, regressions, and release polish gaps. diff --git a/.agents/skills/final-release-review/agents/openai.yaml b/.agents/skills/final-release-review/agents/openai.yaml deleted file mode 100644 index 1c09487791..0000000000 --- a/.agents/skills/final-release-review/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Final Release Review" - short_description: "Audit a release candidate against the previous tag" - default_prompt: "Use $final-release-review to audit the release candidate diff against the previous release tag and call the ship/block gate." diff --git a/.agents/skills/final-release-review/references/review-checklist.md b/.agents/skills/final-release-review/references/review-checklist.md deleted file mode 100644 index 3cd5d4d2a6..0000000000 --- a/.agents/skills/final-release-review/references/review-checklist.md +++ /dev/null @@ -1,65 +0,0 @@ -# Release Diff Review Checklist - -## Quick commands - -- Sync tags: `git fetch origin --tags --prune`.
-- Identify latest release tag (default pattern `v*`): `git tag -l 'v*' --sort=-v:refname | head -n1` or use `.agents/skills/final-release-review/scripts/find_latest_release_tag.sh`. -- Generate overview: `git diff --stat BASE...TARGET`, `git diff --dirstat=files,0 BASE...TARGET`, `git log --oneline --reverse BASE..TARGET`. -- Inspect risky files quickly: `git diff --name-status BASE...TARGET`, `git diff --word-diff BASE...TARGET -- `. - -## Gate decision matrix - -- Choose `🟢 GREEN LIGHT TO SHIP` when no concrete blocking trigger is found. -- Choose `🔴 BLOCKED` only when at least one blocking trigger has concrete evidence and a defined unblock action. -- Blocking triggers: - - Confirmed regression/bug introduced in the diff. - - Confirmed breaking public API/protocol/config change with missing or mismatched versioning/migration path. - - Concrete data-loss/corruption/security-impacting issue with unresolved mitigation. - - Release-critical build/package/runtime break introduced by the diff. -- Non-blocking by itself: - - Large refactor or high file count. - - Speculative risk without evidence. - - Not running tests locally. -- If uncertain, keep gate green and provide focused follow-up checks. - -## Actionability contract - -- Every risk finding should include: - - `Evidence`: specific file/commit/diff/test signal. - - `Impact`: one-sentence user or runtime effect. - - `Action`: concrete command/task with pass criteria. -- A `BLOCKED` report must contain an `Unblock checklist` with at least one executable item. -- If no executable unblock item exists, do not block; downgrade to green with follow-up checks. - -## Breaking change signals - -- Public API surface: removed/renamed modules, classes, functions, or re-exports; changed parameters/return types, default values changed, new required options, stricter validation. -- Protocol/schema: request/response fields added/removed/renamed, enum changes, JSON shape changes, ID formats, pagination defaults.
-- Config/CLI/env: renamed flags, default behavior flips, removed fallbacks, environment variable changes, logging levels tightened. -- Dependencies/platform: Python version requirement changes, dependency major bumps, `pyproject.toml`/`uv.lock` changes, removed or renamed extras. -- Persistence/data: migration scripts missing, data model changes, stored file formats, cache keys altered without invalidation. -- Docs/examples drift: examples still reflect old behavior or lack migration note. - -## Regression risk clues - -- Large refactors with light test deltas or deleted tests; new `skip`/`todo` markers. -- Concurrency/timing: new async flows, asyncio event-loop changes, retries, timeouts, debounce/caching changes, race-prone patterns. -- Error handling: catch blocks removed, swallowed errors, broader catch-all added without logging, stricter throws without caller updates. -- Stateful components: mutable shared state, global singletons, lifecycle changes (init/teardown), resource cleanup removal. -- Third-party changes: swapped core libraries, feature flags toggled, observability removed or gated. - -## Improvement opportunities - -- Missing coverage for new code paths; add focused tests. -- Performance: obvious N+1 loops, repeated I/O without caching, excessive serialization. -- Developer ergonomics: unclear naming, missing inline docs for public APIs, missing examples for new features. -- Release hygiene: add migration/upgrade note when behavior changes; ensure changelog/notes capture user-facing shifts. - -## Evidence to capture in the review output - -- BASE tag and TARGET ref used for the diff; confirm tags fetched. -- High-level diff stats and key directories touched. -- Concrete files/commits that indicate breaking changes or risk, with brief rationale. -- Tests or commands suggested to validate suspected risks (include pass criteria). -- Explicit release gate call (ship/block) with conditions to unblock. 
-- `Unblock checklist` section when (and only when) gate is `BLOCKED`. diff --git a/.agents/skills/final-release-review/scripts/find_latest_release_tag.sh b/.agents/skills/final-release-review/scripts/find_latest_release_tag.sh deleted file mode 100755 index f36ae497b0..0000000000 --- a/.agents/skills/final-release-review/scripts/find_latest_release_tag.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -remote="${1:-origin}" -pattern="${2:-v*}" - -# Sync tags from the remote to ensure the latest release tag is available locally. -git fetch "$remote" --tags --prune --quiet - -latest_tag=$(git tag -l "$pattern" --sort=-v:refname | head -n1) - -if [[ -z "$latest_tag" ]]; then - echo "No tags found matching pattern '$pattern' after fetching from $remote." >&2 - exit 1 -fi - -echo "$latest_tag" diff --git a/.agents/skills/implementation-strategy/SKILL.md b/.agents/skills/implementation-strategy/SKILL.md deleted file mode 100644 index 55a800c28f..0000000000 --- a/.agents/skills/implementation-strategy/SKILL.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -name: implementation-strategy -description: Decide how to implement runtime and API changes in openai-agents-python before editing code. Use when a task changes exported APIs, runtime behavior, serialized state, tests, or docs and you need to choose the compatibility boundary, whether shims or migrations are warranted, and when unreleased interfaces can be rewritten directly. ---- - -# Implementation Strategy - -## Overview - -Use this skill before editing code when the task changes runtime behavior or anything that might look like a compatibility concern. The goal is to keep implementations simple while protecting real released contracts. - -## Quick start - -1. Identify the surface you are changing: released public API, unreleased branch-local API, internal helper, persisted schema, wire protocol, CLI/config/env surface, or docs/examples only. -2. 
Determine the latest release boundary: - ```bash - BASE_TAG="$(.agents/skills/final-release-review/scripts/find_latest_release_tag.sh origin 'v*' 2>/dev/null || git tag -l 'v*' --sort=-v:refname | head -n1)" - echo "$BASE_TAG" - ``` -3. Judge breaking-change risk against that latest release tag, not against unreleased branch churn or post-tag changes already on `main`. -4. Prefer the simplest implementation that satisfies the current task. Update callers, tests, docs, and examples directly instead of preserving superseded unreleased interfaces. -5. Add a compatibility layer only when there is a concrete released consumer or an explicitly supported durable external state boundary that requires it, or when the user explicitly asks for a migration path. - -## Compatibility boundary rules - -- Released public API or documented external behavior: preserve compatibility or provide an explicit migration path. -- Persisted schema, serialized state, wire protocol, CLI flags, environment variables, and externally consumed config: treat as compatibility-sensitive when they are part of the latest release or when the repo explicitly intends to preserve them across commits, processes, or machines. -- Python-specific durable surfaces such as `RunState`, session persistence, exported dataclass constructor order, and documented model/provider configuration should be treated as compatibility-sensitive when they were part of the latest release tag or are explicitly supported as a shared durability boundary. -- Interface changes introduced only on the current branch: not a compatibility target. Rewrite them directly. -- Interface changes present on `main` but added after the latest release tag: not a semver breaking change by themselves. Rewrite them directly unless they already define a released or explicitly supported durable external state boundary. -- Internal helpers, private types, same-branch tests, fixtures, and examples: update them directly instead of adding adapters. 
-- Unreleased persisted schema versions on `main` may be renumbered or squashed before release when intermediate snapshots are intentionally unsupported. When you do that, update the support set and tests together so the boundary is explicit. - -## Default implementation stance - -- Prefer deletion or replacement over aliases, overloads, shims, feature flags, and dual-write logic when the old shape is unreleased. -- Do not preserve a confusing abstraction just because it exists in the current branch diff. -- If review feedback claims a change is breaking, verify it against the latest release tag and actual external impact before accepting the feedback. -- If a change truly crosses the latest released contract boundary, call that out explicitly in the ExecPlan, release notes context, and user-facing summary. - -## When to stop and confirm - -- The change would alter behavior shipped in the latest release tag. -- The change would modify durable external data, protocol formats, or serialized state. -- The user explicitly asked for backward compatibility, deprecation, or migration support. 
- -## Output expectations - -When this skill materially affects the implementation approach, state the decision briefly in your reasoning or handoff, for example: - -- `Compatibility boundary: latest release tag v0.x.y; branch-local interface rewrite, no shim needed.` -- `Compatibility boundary: released RunState schema; preserve compatibility and add migration coverage.` diff --git a/.agents/skills/implementation-strategy/agents/openai.yaml b/.agents/skills/implementation-strategy/agents/openai.yaml deleted file mode 100644 index 9a64342d19..0000000000 --- a/.agents/skills/implementation-strategy/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Implementation Strategy" - short_description: "Choose a compatibility-aware implementation plan" - default_prompt: "Use $implementation-strategy to choose the implementation approach and compatibility boundary before editing runtime code." diff --git a/.agents/skills/openai-knowledge/SKILL.md b/.agents/skills/openai-knowledge/SKILL.md deleted file mode 100644 index f223568bfa..0000000000 --- a/.agents/skills/openai-knowledge/SKILL.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -name: openai-knowledge -description: Use when working with the OpenAI API (Responses API) or OpenAI platform features (tools, streaming, Realtime API, auth, models, rate limits, MCP) and you need authoritative, up-to-date documentation (schemas, examples, limits, edge cases). Prefer the OpenAI Developer Documentation MCP server tools when available; otherwise guide the user to enable `openaiDeveloperDocs`. ---- - -# OpenAI Knowledge - -## Overview - -Use the OpenAI Developer Documentation MCP server to search and fetch exact docs (markdown), then base your answer on that text instead of guessing. - -## Workflow - -### 1) Check whether the Docs MCP server is available - -If the `mcp__openaiDeveloperDocs__*` tools are available, use them. - -If you are unsure, run `codex mcp list` and check for `openaiDeveloperDocs`. 
- -### 2) Use MCP tools to pull exact docs - -- Search first, then fetch the specific page or pages. - - `mcp__openaiDeveloperDocs__search_openai_docs` → pick the best URL. - - `mcp__openaiDeveloperDocs__fetch_openai_doc` → retrieve the exact markdown (optionally with an `anchor`). -- When you need endpoint schemas or parameters, use: - - `mcp__openaiDeveloperDocs__get_openapi_spec` - - `mcp__openaiDeveloperDocs__list_api_endpoints` - -Base your answer on the fetched text and quote or paraphrase it precisely. Do not invent flags, field names, defaults, or limits. - -### 3) If MCP is not configured, guide setup (do not change config unless asked) - -Provide one of these setup options, then ask the user to restart the Codex session so the tools load: - -- CLI: - - `codex mcp add openaiDeveloperDocs --url https://developers.openai.com/mcp` -- Config file (`~/.codex/config.toml`): - - Add: - ```toml - [mcp_servers.openaiDeveloperDocs] - url = "https://developers.openai.com/mcp" - ``` - -Also point to: https://developers.openai.com/resources/docs-mcp#quickstart diff --git a/.agents/skills/openai-knowledge/agents/openai.yaml b/.agents/skills/openai-knowledge/agents/openai.yaml deleted file mode 100644 index 5012167865..0000000000 --- a/.agents/skills/openai-knowledge/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "OpenAI Knowledge" - short_description: "Pull authoritative OpenAI platform documentation" - default_prompt: "Use $openai-knowledge to fetch the exact OpenAI docs needed for this API or platform question." diff --git a/.agents/skills/pr-draft-summary/SKILL.md b/.agents/skills/pr-draft-summary/SKILL.md deleted file mode 100644 index 79f2800a59..0000000000 --- a/.agents/skills/pr-draft-summary/SKILL.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: pr-draft-summary -description: Create a PR title and draft description after substantive code changes are finished.
Trigger when wrapping up a moderate-or-larger change (runtime code, tests, build config, docs with behavior impact) and you need the PR-ready summary block with change summary plus PR draft text. ---- - -# PR Draft Summary - -## Purpose -Produce the PR-ready summary required in this repository after substantive code work is complete: a concise summary plus a PR-ready title and draft description that begins with "This pull request ...". The block should be ready to paste into a PR for openai-agents-python. - -## When to Trigger -- The task for this repo is finished (or ready for review) and it touched runtime code, tests, examples, docs with behavior impact, or build/test configuration. -- You are about to send the "work complete" response and need the PR block included. -- Skip only for trivial or conversation-only tasks where no PR-style summary is expected. - -## Inputs to Collect Automatically (do not ask the user) -- Current branch: `git rev-parse --abbrev-ref HEAD`. -- Working tree: `git status -sb`. -- Untracked files: `git ls-files --others --exclude-standard` (use with `git status -sb` to ensure they are surfaced; `--stat` does not include them). -- Changed files: `git diff --name-only` (unstaged) and `git diff --name-only --cached` (staged); sizes via `git diff --stat` and `git diff --stat --cached`. -- Latest release tag (prefer remote-aware lookup): `LATEST_RELEASE_TAG=$(.agents/skills/final-release-review/scripts/find_latest_release_tag.sh origin 'v*' 2>/dev/null || git tag -l 'v*' --sort=-v:refname | head -n1)`. -- Base reference (use the branch's upstream, fallback to `origin/main`): - - `BASE_REF=$(git rev-parse --abbrev-ref --symbolic-full-name @{upstream} 2>/dev/null || echo origin/main)`. - - `BASE_COMMIT=$(git merge-base --fork-point "$BASE_REF" HEAD || git merge-base "$BASE_REF" HEAD || echo "$BASE_REF")`. -- Commits ahead of the base fork point: `git log --oneline --no-merges ${BASE_COMMIT}..HEAD`. 
-- Category signals for this repo: runtime (`src/agents/`), tests (`tests/`), examples (`examples/`), docs (`docs/`, `mkdocs.yml`), build/test config (`pyproject.toml`, `uv.lock`, `Makefile`, `.github/`). - -## Workflow -1) Run the commands above without asking the user; compute `BASE_REF`/`BASE_COMMIT` first so later commands reuse them. -2) If there are no staged/unstaged/untracked changes and no commits ahead of `${BASE_COMMIT}`, reply briefly that no code changes were detected and skip emitting the PR block. -3) Infer change type from the touched paths listed under "Category signals"; classify as feature, fix, refactor, or docs-with-impact, and flag backward-compatibility risk only when the diff changes released public APIs, external config, persisted data, serialized state, or wire protocols. Judge that risk against `LATEST_RELEASE_TAG`, not unreleased branch-only churn. -4) Summarize changes in 1–3 short sentences using the key paths (top 5) and `git diff --stat` output; explicitly call out untracked files from `git status -sb`/`git ls-files --others --exclude-standard` because `--stat` does not include them. If the working tree is clean but there are commits ahead of `${BASE_COMMIT}`, summarize using those commit messages. -5) Choose the lead verb for the description: feature → `adds`, bug fix → `fixes`, refactor/perf → `improves` or `updates`, docs-only → `updates`. -6) Suggest a branch name. If already off main, keep it; otherwise propose `feat/`, `fix/`, or `docs/` based on the primary area (e.g., `docs/pr-draft-summary-guidance`). -7) If the current branch matches `issue-` (digits only), keep that branch suggestion. Optionally pull light issue context (for example via the GitHub API) when available, but do not block or retry if it is not. When an issue number is present, reference `https://github.com/openai/openai-agents-python/issues/` and include an auto-closing line such as `This pull request resolves #.`.
-8) Draft the PR title and description using the template below. -9) Output only the block in "Output Format". Keep any surrounding status note minimal and in English. - -## Output Format -When closing out a task and the summary block is desired, add this concise Markdown block (English only) after any brief status note. If the user says they do not want it, skip this section. - -``` -# Pull Request Draft - -## Branch name suggestion - -git checkout -b - -## Title - - - -## Description - - -``` - -Keep it tight—no redundant prose around the block, and avoid repeating details between `Changes` and the description. Tests do not need to be listed unless specifically requested. diff --git a/.agents/skills/pr-draft-summary/agents/openai.yaml b/.agents/skills/pr-draft-summary/agents/openai.yaml deleted file mode 100644 index 572ac1f62f..0000000000 --- a/.agents/skills/pr-draft-summary/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "PR Draft Summary" - short_description: "Draft the repo-ready PR title and description" - default_prompt: "Use $pr-draft-summary to generate the PR-ready summary block, title, and draft description for the current changes." diff --git a/.agents/skills/runtime-behavior-probe/SKILL.md b/.agents/skills/runtime-behavior-probe/SKILL.md deleted file mode 100644 index f98dc12e49..0000000000 --- a/.agents/skills/runtime-behavior-probe/SKILL.md +++ /dev/null @@ -1,160 +0,0 @@ ---- -name: runtime-behavior-probe -description: Plan and execute runtime-behavior investigations with temporary probe scripts, validation matrices, state controls, and findings-first reports. Use only when the user explicitly invokes this skill to verify actual runtime behavior beyond normal code-level checks, especially to uncover edge cases, undocumented behavior, or common failure modes in local or live integrations. A baseline smoke check is fine as an entry point, but do not stop at happy-path confirmation.
---- - -# Runtime Behavior Probe - -## Overview - -Use this skill to investigate real runtime behavior, not to restate code or documentation. Start by planning the investigation, then execute a case matrix, record observed behavior, and report both the findings and the method used to obtain them. - -## Core Rules - -- Treat this skill as manual-only. Do not rely on implicit invocation. -- A baseline success or smoke case is often the right entry point, but do not stop there when the real question involves edge cases, drift, or failure behavior. -- Plan before running anything. Write the case matrix first, then fill it in with observed results. The matrix can live in a scratch note, a temporary file, or the probe script header. -- Default to local or read-only probes. Consider a live service only when it is clearly relevant, then apply the lightweight gates below before you run it. -- Size the probe to the decision. Start with the smallest matrix that can disqualify or validate the current hypothesis, then expand only when uncertainty remains. -- Before a live probe, apply three lightweight gates: - - Destination gate. Use only a live destination that is clearly allowed for the task. - - Intent gate. Run the live probe only when the user explicitly wants runtime verification on that integration, or explicitly approves it after you propose the probe. - - Data gate. If the probe will read environment variables, mutate remote state, incur material cost, or exercise non-public or user data, name the exact variable names or data class and get explicit approval first. -- Classify each case as read-only, mutating, or costly before execution. For mutating or costly cases, or for any live case that will read environment variables, define cleanup or rollback before running the probe. -- Use temporary files or a temporary directory for one-off probe scripts. -- Keep temporary artifacts until the final response is drafted. 
Then delete them by default unless the user asked to keep them or they are needed for follow-up. Even when artifacts are deleted, keep a short run summary of the command shape, runtime context, and artifact status in the report. -- Before executing a live probe that will read environment variables, tell the user the exact variable names you plan to use and why, then wait for explicit approval. Examples include `OPENAI_API_KEY` and other expected default names for the system under test. -- Never print secrets, even when they come from standard environment variables that this skill may use. -- For OpenAI API or OpenAI platform probes in this repository, use [$openai-knowledge](../openai-knowledge/SKILL.md) early to confirm contract-sensitive details such as supported parameters, field names, and limits. Use runtime probing to validate or challenge the documented behavior, not to skip the documentation pass entirely. If the docs MCP is unavailable, fall back to the official OpenAI docs and say that you used the fallback in the report. -- For benchmark or comparison probes, make parity explicit before execution. Record what is held constant, what variable is under test, which response-shape constraints keep the comparison fair, and any usage or token counters that matter for interpreting latency or cost. -- For OpenAI hosted tool probes, remove setup ambiguity before attributing a negative result to runtime behavior: - - Force the tool path with the matching `tool_choice` when the question depends on tool invocation. - - Treat `container_auto` and `container_reference` as separate cases, not interchangeable setup details. - - Clear unsupported model or tool options first so they do not invalidate the probe. - -## Workflow - -1. Restate the investigation target in operational terms. Name the runtime surface, the key uncertainty, and the highest-risk behaviors to test. -2. Do a short preflight. 
Check the relevant code or docs first, decide whether the question needs local or live validation, and note any repo, baseline, or release boundary that matters. -3. Create a validation matrix before executing probes. Cover both baseline behavior and the most relevant failure or drift cases. The matrix can live in a scratch note, a temporary file, or a structured header inside the probe script. -4. For each case, choose an execution mode up front: - - `single-shot` for deterministic one-run checks. - - `repeat-N` for cache, retry, streaming, interruption, rate-limit, concurrency, or other run-to-run-sensitive behavior. - - `warm-up + repeat-N` when first-run cold-start effects could distort the result. - Use these defaults unless the task clearly needs something else: - - Quick screen of a repeat-sensitive question: `repeat-3`. - - Decision-grade latency or release recommendation: `warm-up + repeat-10`. - - Costly live cases: start at `repeat-3`, then expand only if the answer remains unclear. - If it is genuinely unclear whether extra runs are worth the time or cost, ask the user before expanding the probe. -5. When the question is benchmark-like or comparative, run in phases. Start with a high-signal pilot matrix against a control, then expand only the surviving candidates or unresolved cases. -6. If the question is about a suspected regression or behavior change, add at least one known-good control case such as `origin/main`, the latest release, or the same request without the suspected option. -7. For comparative probes, define parity before execution. Record prompt or input shape, tool-choice setup, model-settings parity, state reuse rules, and any response-shape constraint that keeps the comparison fair. If materially different output length could bias the result, record usage or token notes too. -8. 
If the question asks whether one option has the same intelligence or quality as another, decide whether the matrix supports only example-pattern parity or a broader quality claim. For broader claims, add at least one harder or more open-ended case. Otherwise say explicitly that the result is limited to the covered patterns. -9. Plan state controls before execution when hidden state could affect the result. Record whether each case uses fresh or reused state, how cache reuse or cache busting is handled, what unique IDs isolate repeated runs, and how cleanup is verified. -10. If any live case will read environment variables, list the exact variable names and purpose for each case, then ask the user for approval before execution. Keep the approval ask short and include destination, read-only versus mutating or costly risk, exact variable names, and cleanup or rollback if relevant. -11. Build task-specific probe scripts in a temporary location. Keep the script small, observable, and easy to discard. -12. In `openai-agents-python`, make the runtime context explicit: - - Run Python probes from the repository root with `uv run python` when practical. - - Record the current commit, working directory, Python executable, and Python version. - - Avoid accidental imports from a different checkout or site-packages location. If you must deviate from `uv run python`, say exactly why and what interpreter or environment was used instead. -13. Execute the matrix and capture evidence. Record request shape, setup, observation summary, unexpected or negative result, error details, timing, runtime context, approved environment-variable names, repeat counts, warm-up handling, variance when relevant, cleanup behavior, and for comparisons note what was held constant plus any response-shape or usage notes that affect interpretation. -14. Update the matrix with actual outcomes, not guesses. -15. Keep temporary artifacts until the final response is drafted. 
Then delete them unless the user asked to keep them or they are needed for follow-up. Benchmark and repeat-heavy probes often need follow-up, so keeping artifacts is normal when the result may be revisited. If deleted, retain and report a short run summary. -16. Report findings first, with unexpected or negative findings first. Then summarize how the validation was performed and which cases were covered. -17. If the probe isolates one clear defect, you may include a short implementation hypothesis or minimal repro direction. Do not expand into a larger next-step plan unless the user asked for it. - -## Validation Matrix - -Use a matrix that makes the news easy to scan. Start from the runtime question and the observation summary, not just from `expected` and `pass` or `fail`. - -Use a matrix with at least these columns: - -- `case_id` -- `scenario` -- `mode` -- `question` -- `setup` -- `observation_summary` -- `result_flag` -- `evidence` - -Add these columns when they materially improve the investigation: - -- `comparison_basis` -- `variable_under_test` -- `held_constant` -- `output_constraint` -- `status` -- `confidence` -- `state_setup` -- `repeats` -- `warm_up` -- `variance` -- `usage_note` -- `risk_profile` -- `env_vars` -- `approval` -- `control` - -Treat `result_flag` as a fast scan field such as `unexpected`, `negative`, `expected`, or `blocked`. Use `status` only when there is a credible comparison basis, baseline, or documented contract to compare against. - -Always consider whether the matrix should include these categories: - -- Baseline success. -- Control or baseline comparison when a regression is suspected. -- Boundary input or parameter variation. -- Invalid or unsupported input. -- Missing or incorrect configuration. -- Transient external failure such as timeout, network interruption, or rate limiting. -- Retry, idempotence, or cleanup behavior. -- Concurrency or overlapping operations when shared state or ordering may matter. 
-- Open-ended quality or intelligence samples when the question is broader than pattern parity. - -Open [validation-matrix.md](./references/validation-matrix.md) when you need a stronger prioritization model or a reusable case template. - -## Temporary Probe Scripts - -Write one-off scripts in a temporary file or temporary directory such as one created by `mktemp -d` or Python `tempfile`. Keep the script outside the repository by default, even when it imports code from the repository. - -If the probe needs repository code: - -- Run it with the repository as the working directory, or -- Set `PYTHONPATH` or the equivalent import path explicitly. -- In `openai-agents-python`, prefer `uv run python /tmp/probe.py` from the repository root. - -Design the probe to maximize observability: - -- Print or log the exact scenario being exercised. -- Capture runtime context such as git SHA, working directory, Python executable and version, relevant package versions, model or deployment name, endpoint or base URL alias, and any retry or tool options that materially affect behavior. -- For live probes, record only the names of environment variables that were approved for use. Never print their values. -- Capture structured outputs when possible. -- Preserve raw error type, message, and status code. -- For repeat-sensitive cases, capture the attempt index, warm-up status, and any stable identifiers that help compare runs. -- For repeated or benchmark-style probes, write both raw results and a compact summary artifact when practical. -- Keep branching minimal so each script answers a narrow question. - -Before deleting the temporary script or directory, keep a short run summary of the script path, command used, runtime context, and whether the evidence was kept or deleted. - -Open [python_probe.py](./templates/python_probe.py) when you want a lightweight disposable Python probe scaffold. - -## Reporting - -Report in this order: - -1. Findings. 
Put unexpected or negative findings first. If there was no real news, say that explicitly. -2. Validation approach. Summarize the code used, the runtime surface exercised, the execution modes, and the case matrix coverage. -3. Case results. Include the matrix or a condensed version of it when the case count is large. -4. Artifact status and brief run summary. State whether temporary artifacts were deleted or kept, and provide kept paths or the retained summary. -5. Optional implementation note. Include this only when one clear defect was isolated and a short implementation direction would help. - -For comparative probes, the report should also say what was held constant, what variable was under test, and whether the result supports only pattern parity or a broader quality claim. - -Open [reporting-format.md](./references/reporting-format.md) for the recommended response template. - -## Resources - -- Open [validation-matrix.md](./references/validation-matrix.md) to design and prioritize the case matrix. -- Open [error-cases.md](./references/error-cases.md) to expand common failure scenarios. -- Open [openai-runtime-patterns.md](./references/openai-runtime-patterns.md) for recurring OpenAI and Responses API probe patterns. -- Open [reporting-format.md](./references/reporting-format.md) for the final report structure. -- Open [python_probe.py](./templates/python_probe.py) for a minimal disposable Python probe scaffold. diff --git a/.agents/skills/runtime-behavior-probe/agents/openai.yaml b/.agents/skills/runtime-behavior-probe/agents/openai.yaml deleted file mode 100644 index fd7635d397..0000000000 --- a/.agents/skills/runtime-behavior-probe/agents/openai.yaml +++ /dev/null @@ -1,6 +0,0 @@ -interface: - display_name: "Runtime Behavior Probe" - short_description: "Plan and run runtime behavior probes" - default_prompt: "Use $runtime-behavior-probe to investigate actual runtime behavior with a validation matrix, explicit state controls, and a findings-first report." 
-policy: - allow_implicit_invocation: false diff --git a/.agents/skills/runtime-behavior-probe/references/error-cases.md b/.agents/skills/runtime-behavior-probe/references/error-cases.md deleted file mode 100644 index 66f713992d..0000000000 --- a/.agents/skills/runtime-behavior-probe/references/error-cases.md +++ /dev/null @@ -1,80 +0,0 @@ -# Common Error Cases - -Use this reference to expand beyond the happy path. Favor error cases that a real user or operator is likely to hit. - -## Configuration Errors - -Check whether the runtime behaves differently for: - -- Missing required environment variables. -- Present but malformed secrets or identifiers. -- Wrong endpoint or base URL. -- Wrong model or deployment name. -- Incompatible local dependency versions. - -Look for: - -- Error type and status code. -- Whether the failure is immediate or delayed. -- Whether the message is actionable. -- Whether retrying without fixing configuration changes anything. - -## Input Errors - -Probe common bad-input patterns such as: - -- Missing required fields. -- Wrong data type. -- Unsupported enum or option value. -- Empty but syntactically valid input. -- Oversized input or too many items. -- Mutually incompatible options. - -Prefer realistic invalid inputs over artificial nonsense. The point is to learn how the runtime fails in practice. - -## Transport and Availability Errors - -When networked services are involved, consider: - -- Connection failure. -- Read timeout. -- Server timeout or upstream gateway error. -- Rate limit response. -- Partial stream interruption. -- Reusing a connection after a failure. - -Capture whether the client library retries automatically, whether it surfaces retry metadata, and whether the final exception preserves the original cause. - -## State and Repetition Errors - -Many surprising bugs appear only when an operation is repeated or interrupted: - -- Re-submit the same request. -- Repeat after a timeout. 
-- Retry after a partial tool call or partial stream. -- Resume after local cleanup or process restart. -- Repeat with slightly changed inputs while reusing shared state. - -Observe whether the operation is idempotent, duplicated, silently ignored, or left in a partial state. - -## Concurrency Errors - -When shared state, ordering, or isolation may matter, consider: - -- Two overlapping requests with the same logical input. -- Parallel runs that reuse the same cache key, session, container, or temporary resource. -- Concurrent retries, cancellation, or cleanup racing with active work. -- Output or event streams from one run leaking into another. - -Capture whether the runtime serializes, rejects, duplicates, corrupts, or cross-contaminates the work. - -## Investigation Heuristics - -Use these heuristics to pick error cases quickly: - -- Ask which failure a real engineer would debug first in production. -- Ask which failure is most expensive if it is misunderstood. -- Ask which failure would be invisible from code review alone. -- Ask which failure path is likely to differ across environments. - -If the error behavior is already perfectly obvious from a local validator or type system, it is usually low priority for this skill. diff --git a/.agents/skills/runtime-behavior-probe/references/openai-runtime-patterns.md b/.agents/skills/runtime-behavior-probe/references/openai-runtime-patterns.md deleted file mode 100644 index 7aee7683dc..0000000000 --- a/.agents/skills/runtime-behavior-probe/references/openai-runtime-patterns.md +++ /dev/null @@ -1,126 +0,0 @@ -# OpenAI Runtime Patterns - -Use this reference for recurring OpenAI investigations so you do not have to rediscover the probe strategy each time. In this repository, use [$openai-knowledge](../../openai-knowledge/SKILL.md) up front for contract-sensitive details, then use this reference to design the runtime validation. 
If the docs MCP is unavailable, fall back to the official OpenAI docs and say so in the report. - -## General Rules - -- Prefer small live probes over large harnesses. -- Keep one script focused on one uncertainty. -- For comparative or benchmark-like questions, start with a pilot and expand only when the answer is still unclear. -- Capture both the request shape and the returned item types. -- Preserve raw error payloads and status codes. -- Record whether behavior differs between the first call and a repeated call. -- When the question is about regression or contract drift, add a known-good control run before attributing the result to the change under investigation. -- Keep comparison parity explicit. Record what was held constant, what variable changed, and whether output-shape or usage differences could bias the conclusion. -- When the question depends on tool invocation, force the target path with the matching `tool_choice`. -- Treat `container_auto` and `container_reference` as distinct setup modes, not interchangeable details. -- Clear unsupported model or tool options before diagnosing runtime behavior. - -## Standard Environment Variables - -Do not read these variables automatically. Before a live probe uses any of them, tell the user the exact variable names you plan to read and why each one is needed, then wait for explicit approval. Never print their values: - -- `OPENAI_API_KEY` -- `OPENAI_BASE_URL` -- `OPENAI_ORG_ID` -- `OPENAI_PROJECT_ID` - -If the task targets another standard integration, use that integration's expected default variable names under the same rule. - -## Responses API Probe Patterns - -For Responses API work, start from the uncertainty instead of from the full feature surface. - -### Benchmark or model-switch comparisons - -Use when you need to compare models, settings, transports, or providers with enough rigor to support a product or release decision. 
- -Probe suggestions: - -- Start with a pilot that includes one control and two or three highest-signal scenarios. -- Keep prompt shape, tool choice, state setup, and non-tested settings aligned across candidates. -- If the question is about speed, capture medians and, when relevant, first-token latency plus any usage note that could explain the difference. -- If the question is about "same intelligence" or "same quality," add at least one harder or more open-ended case. Otherwise report the result as pattern parity only. -- Expand to a larger matrix only when the pilot survives, the candidates are close, or a major runtime surface is still uncovered. - -### Plain response behavior - -Use when you need to confirm: - -- The shape of returned output items. -- Whether text appears in one item or multiple items. -- How metadata appears in the final object. - -Probe suggestions: - -- Baseline call with a minimal input. -- Same call with a slightly different instruction shape. -- Repeat the same call to check output stability where that matters. - -### Structured output behavior - -Use when you need to observe: - -- Schema rejection versus best-effort completion. -- Handling of missing required fields. -- Differences between model-compliant output and transport-level errors. - -Probe suggestions: - -- Valid schema and valid prompt. -- Prompt likely to produce omitted fields. -- Clearly incompatible schema or unsupported option when relevant. - -### Tool invocation behavior - -Use when you need to learn: - -- When tool calls are emitted. -- How arguments are shaped at runtime. -- What happens when the tool fails or returns malformed output. - -Probe suggestions: - -- Baseline tool-call success. -- Tool failure with a realistic exception. -- Tool result that is syntactically valid but semantically incomplete. 
- -### Hosted shell and code interpreter failure shields - -When probing hosted tools through the Responses API, eliminate common setup ambiguity first: - -- Force the tool path you want to test with the matching `tool_choice`. A text-only completion without forced tool choice is not a reliable negative result. -- Treat `container_auto` and `container_reference` differently. Use `container_auto` when the probe needs fresh container provisioning or skill attachment, and use `container_reference` only to reuse existing container state. -- Do not assume every environment field is accepted on every container mode. If the probe is about skills, validate that the chosen container mode actually supports skill attachment before treating an API error as a runtime defect. -- Check model-specific option support before chasing unrelated failures. Unsupported reasoning or model settings can invalidate the probe before the tool path is exercised. -- For hosted package installation, treat network-dependent setup as best-effort and separate install failures from the underlying tool behavior you are trying to observe. -- For prompt cache investigations, keep model, instructions, tool configuration, and cache key effectively identical across repeated runs before interpreting `cached_tokens`. - -### Streaming behavior - -Use when the uncertainty involves: - -- Event ordering. -- Partial text delivery. -- Termination after interruption. -- Tool-call events in streams. - -Probe suggestions: - -- Normal streamed completion. -- Early local cancellation. -- Network interruption if it can be reproduced safely. - -## What to Capture - -For OpenAI probes, try to record: - -- Request options that materially affect behavior. -- Response item types and their order. -- Whether fields are absent, null, empty, or transformed. -- Server status and error payload details for failures. -- Retry and backoff hints when present. 
-- Stable identifiers that help compare repeated runs, such as request IDs, response IDs, tool call IDs, or container IDs when available. -- Which environment-variable names were approved for the probe when live credentials were required. - -Do not spend time rediscovering static documentation unless the runtime result seems to contradict what you expected. The value of this skill is in the observed behavior. diff --git a/.agents/skills/runtime-behavior-probe/references/reporting-format.md b/.agents/skills/runtime-behavior-probe/references/reporting-format.md deleted file mode 100644 index 936888eef4..0000000000 --- a/.agents/skills/runtime-behavior-probe/references/reporting-format.md +++ /dev/null @@ -1,118 +0,0 @@ -# Reporting Format - -Lead with findings, not process. The user asked for investigation results, so the answer should start with the most important observed behaviors. Put the real news first. - -## Recommended Order - -1. Findings. -2. Validation approach. -3. Case matrix or condensed case summary. -4. Artifact status and brief run summary. -5. Optional implementation note. - -## Findings Section - -Make each finding answer one user-relevant question. Good findings usually include: - -- What was observed. -- Why it matters. -- The condition under which it happens. -- What was held constant when the finding comes from a comparison probe. -- `scope`: The boundary of the finding, such as commit, model, Python version, live vs local, or repeat mode. -- `confidence`: `high`, `medium`, or `low`. - -Avoid burying the main result under setup details. - -Put `unexpected` or `negative` findings first. If there were no unexpected or negative findings in the executed cases, say that explicitly before the rest of the findings section. - -If the probe was comparative, say whether the result supports: - -- Pattern parity only. -- A broader quality claim. - -Do not imply a broader quality equivalence than the executed cases justify. 
- -## Validation Approach Section - -Summarize: - -- The runtime surface you exercised. -- The shape of the probe code, in overview only. -- Which categories of cases you covered. -- Which execution modes you used, including repeat counts or warm-up handling when relevant. -- Whether live credentials or external services were used. -- Any important state controls such as fresh state, cache reuse, cache busting, unique IDs, or cleanup verification. -- For comparison probes, what was held constant, what was varied, and whether output-shape or usage differences could still influence the conclusion. -- Whether the usual docs path or an official-docs fallback was used for contract-sensitive checks. - -Keep this concise. The user needs enough detail to trust the result, not a line-by-line replay of the script. - -## Case Summary - -Include either the full matrix or a condensed summary. At minimum, show: - -- Which scenarios were executed. -- Whether the run was a quick pilot, an expanded matrix, or both. -- Which ones produced `unexpected` or `negative` results. -- Which ones passed or failed when a real comparison basis existed. -- Which cases were blocked. -- Where the supporting evidence lived, or that it was deleted. - -If the matrix is large, show the highest-value cases in the main response and keep the rest as a compact appendix or note. - -## Artifact Status And Brief Run Summary - -State one of these explicitly: - -- Temporary artifacts were kept until the final response was drafted, then deleted after validation. -- Temporary artifacts were kept at `<path>` because the user asked to keep them. -- Temporary artifacts were kept at `<path>` because they are needed for follow-up analysis. - -Even if artifacts were deleted, retain a short run summary such as: - -- Probe command or runner shape. -- Runtime context summary such as commit, Python executable, Python version, or model. -- Artifact path and final status. 
- -For benchmark or repeat-heavy probes, keeping artifacts for follow-up is often the right default even when the immediate report is done. - -## Optional Implementation Note - -Include this only when one clear defect was isolated and a short implementation hypothesis or minimal repro direction would help. Keep it brief. Do not turn the report into a broader next-step plan unless the user asked for that. - -## Compact Template - -Use this outline when you need a fast structure: - - Findings: - - - held constant: - scope: - confidence: - - - held constant: - scope: - confidence: - - Validation approach: - - Surface: - - Probe code: - - Coverage: - - Execution modes: - - Comparison parity: - - Docs source: - - Case summary: - | case_id | scenario | result_flag | status | note | - | --- | --- | --- | --- | --- | - | S1 | ... | expected | pass | ... | - | E1 | ... | negative | fail | ... | - - Artifact status and brief run summary: - - Temporary artifacts were kept until the final response was drafted, then deleted. - - Summary: - - Optional implementation note: - - - -Adjust the format to the task, but preserve the ordering. diff --git a/.agents/skills/runtime-behavior-probe/references/validation-matrix.md b/.agents/skills/runtime-behavior-probe/references/validation-matrix.md deleted file mode 100644 index 60e67826ed..0000000000 --- a/.agents/skills/runtime-behavior-probe/references/validation-matrix.md +++ /dev/null @@ -1,137 +0,0 @@ -# Validation Matrix - -Use the matrix to decide what to probe before writing scripts. The goal is not exhaustive combinatorics; the goal is high-value coverage that is visible, explainable, and likely to reveal runtime surprises. The matrix should make the real news easy to scan. - -## Minimum Columns - -Use these columns unless the task clearly needs more: - -- `case_id`: Stable identifier such as `S1`, `E3`, or `R2`. -- `scenario`: Short description of the behavior under test. 
-- `mode`: `single-shot`, `repeat-N`, or `warm-up + repeat-N`. -- `question`: The concrete runtime uncertainty this case is answering. -- `setup`: Inputs, environment, or preconditions required for the case. -- `observation_summary`: A compact summary of what actually happened. -- `result_flag`: `unexpected`, `negative`, `expected`, or `blocked`. -- `evidence`: Path, log reference, or `deleted`. - -Add these columns when they materially improve the investigation: - -- `comparison_basis`: The baseline, docs, or prior behavior you are comparing against. -- `variable_under_test`: The single factor that is intentionally changing in a comparison case. -- `held_constant`: Prompt shape, tool setup, model settings, or state rules that were intentionally kept the same. -- `output_constraint`: Any schema, length, or response-shape constraint used to keep the comparison fair. -- `status`: Use `pass`, `fail`, `unexpected-pass`, `unexpected-fail`, or `blocked` only when there is a credible comparison basis or control. -- `confidence`: `high`, `medium`, or `low`. -- `state_setup`: Fresh or reused state, cache strategy, unique IDs, and cleanup checks. -- `repeats`: Number of measured runs. -- `warm_up`: Whether a warm-up run was used and why. -- `variance`: Any useful spread or instability note across repeated runs. -- `usage_note`: Token, usage, or output-length note when it materially affects interpretation. -- `control`: Known-good comparison point for regression or behavior-change questions. -- `risk_profile`: `read-only`, `mutating`, or `costly` for live probes. -- `env_vars`: Exact environment-variable names the case plans to read. -- `approval`: `not-needed`, `pending`, or `approved` for cases that need user permission before execution. - -Use `result_flag` as the fast scan field. It is what makes unexpected or negative findings jump out before the reader studies the full report. - -Use `status` only when you have a real comparison basis. 
If the case is exploratory and there is no trustworthy baseline, prefer a strong `observation_summary` plus `result_flag` and `confidence` instead of pretending the result is a clean pass or fail. - -## Choosing Execution Mode - -Pick an execution mode before you run the case: - -- Use `single-shot` for deterministic, one-run checks. -- Use `repeat-N` automatically when the question involves cache behavior, retries, streaming, interruptions, rate limiting, concurrency, or other run-to-run-sensitive behavior. -- Use `warm-up + repeat-N` when the first run is likely to include cold-start effects such as container provisioning, import caches, or prompt-cache population. - -Use these defaults unless the task clearly needs something else: - -- `repeat-3` for a quick screen of a repeat-sensitive question. -- `warm-up + repeat-10` for decision-grade latency comparisons or release-facing recommendations. -- For costly live probes, start at `repeat-3` and expand only if the answer is still unclear. - -If it is genuinely unclear whether extra runs are worth the time or cost, ask the user before expanding the probe. - -## Phase The Matrix - -When the question is comparative or benchmark-like, do not jump straight to the largest matrix. - -Start with a pilot: - -- One control. -- One or two highest-signal success cases. -- The smallest repeat count that can disqualify a weak candidate quickly. - -Expand only when: - -- The candidate survives the pilot. -- The results are close enough that more samples matter. -- A major runtime surface is still uncovered. -- The user explicitly wants decision-grade evidence. - -## Coverage Categories - -Try to cover at least one case from each relevant category: - -- `success`: Normal behavior that should work. -- `control`: Known-good comparison such as `origin/main`, the latest release, or the same request without the suspected option. -- `boundary`: Size, count, or parameter limits near a plausible edge. 
-- `invalid`: Bad inputs or unsupported combinations. -- `misconfig`: Missing key, wrong endpoint, bad permissions, or incompatible local setup. -- `transient`: Timeout, temporary server issue, network breakage, or rate limiting. -- `recovery`: Retry behavior, partial completion, duplicate submission, or cleanup. -- `concurrency`: Overlapping operations when shared state, ordering, or isolation may matter. -- `quality`: A harder or more open-ended sample when the user is asking about model intelligence, not just workflow parity. - -If time is limited, prioritize categories in this order: - -1. Known-good control when the question implies regression or drift. -2. Highest-risk success case. -3. Most plausible user-facing failure. -4. Most likely edge case with ambiguous behavior. -5. Cleanup or retry semantics. -6. Lower-probability extremes. - -## Matrix Template - -Use this compact template: - - | case_id | scenario | mode | question | setup | state_setup | variable_under_test | held_constant | comparison_basis | observation_summary | result_flag | status | evidence | - | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | - | K1 | Known-good control | single-shot | Does the baseline still show the expected behavior? | Same probe against baseline target | Fresh state | none | current probe shape | `origin/main` or latest release | pending | pending | pending | pending | - | S1 | Baseline success | single-shot | What does the normal success path look like at runtime? | Valid config and representative input | Fresh state | none | representative input and setup | current docs or local expectation | pending | pending | pending | pending | - | R1 | Cache or retry behavior | warm-up + repeat-N | Does behavior change after the first run or across retries? 
| Same request repeated under controlled settings | Cache key or retry setup recorded | reuse versus fresh state | prompt shape and tool setup | same request without reuse, or docs if available | pending | pending | pending | pending | - | C1 | Model comparison pilot | warm-up + repeat-N | Does candidate B preserve the covered behavior while improving latency? | Same scenario across two models | Fresh state and stable IDs | model name | prompt shape, tool choice, and model settings parity | control model in the same probe | pending | pending | pending | pending | - | E1 | Invalid input | single-shot | How does the runtime reject a realistic bad input? | Missing required field | Fresh state | invalid field value | same request with valid field | same request with valid field | pending | pending | pending | pending | - | X1 | Concurrent overlap | repeat-N | Do overlapping runs interfere with each other? | Two or more overlapping operations | Unique IDs plus cleanup verification | overlap timing | same logical input | same request serialized, if available | pending | pending | pending | pending | - -## Recording Results - -Keep `question` unchanged after execution. Put the actual behavior in `observation_summary`, then mark the scan-friendly `result_flag`. - -Use these `result_flag` values consistently: - -- `unexpected`: The result diverged from the best current understanding in a surprising way. -- `negative`: The result exposed a user-relevant failure, risk, or sharp edge. -- `expected`: The result matched the current understanding and did not reveal new risk. -- `blocked`: The case did not produce a trustworthy observation. - -Only fill `status` when there is a credible comparison basis. Otherwise use `observation_summary`, `result_flag`, and `confidence` to communicate what was learned without over-claiming certainty. 
- -For comparison cases, use `observation_summary` and the final report to say whether the evidence supports pattern parity only or a broader quality claim. - -If a case reveals a new branch of behavior, add a follow-up case instead of overloading the original one. - -## Evidence Discipline - -Treat a case as incomplete when: - -- The observed output omits the key result you were testing. -- The script mixed multiple questions and the result is ambiguous. -- Hidden state, cache behavior, or previous runs may have influenced the result and were not controlled or documented. -- The question is whether behavior changed, but the case has no credible control or baseline to compare against. -- The case plans to read environment variables, but the exact variable names were not approved by the user before execution. -- The case was repeat-sensitive, but it ran only once without a clear rationale. - -When this happens, narrow the probe and rerun. A smaller script with a cleaner result is better than a more complicated script that is hard to trust. diff --git a/.agents/skills/runtime-behavior-probe/templates/python_probe.py b/.agents/skills/runtime-behavior-probe/templates/python_probe.py deleted file mode 100644 index c3e03f6f79..0000000000 --- a/.agents/skills/runtime-behavior-probe/templates/python_probe.py +++ /dev/null @@ -1,227 +0,0 @@ -"""Disposable Python probe scaffold. - -Copy this file to a temporary location and adapt it for one narrow question. 
-Recommended usage from the repository root: - - uv run python /tmp/probe.py - -If you want structured artifacts for repeat-heavy or benchmark probes: - - PROBE_OUTPUT_DIR=/tmp/probe-run uv run python /tmp/probe.py -""" - -from __future__ import annotations - -import json -import os -import platform -import shutil -import statistics -import subprocess -import sys -import time -import uuid -from collections import Counter, defaultdict -from importlib import metadata -from pathlib import Path - -SCENARIO = "replace-me" -RUN_LABEL = "replace-me" -MODE = "single-shot" -APPROVED_ENV_VARS: list[str] = [] -OUTPUT_DIR_ENV = "PROBE_OUTPUT_DIR" - -RESULTS: list[dict[str, object]] = [] - - -def _git_value(*args: str) -> str: - result = subprocess.run( - ["git", *args], - check=False, - capture_output=True, - text=True, - ) - if result.returncode != 0: - return "unknown" - return result.stdout.strip() or "unknown" - - -def _package_version(name: str) -> str | None: - try: - return metadata.version(name) - except metadata.PackageNotFoundError: - return None - - -def _output_dir() -> Path | None: - value = os.getenv(OUTPUT_DIR_ENV) - if not value: - return None - return Path(value) - - -def _write_json(path: Path, payload: object) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") - - -def emit(kind: str, **payload: object) -> None: - print( - json.dumps( - { - "ts": round(time.time(), 3), - "kind": kind, - **payload, - }, - sort_keys=True, - ) - ) - - -def runtime_context() -> dict[str, object]: - approved = {name: ("set" if os.getenv(name) else "unset") for name in APPROVED_ENV_VARS} - package_versions = { - name: version - for name in ("openai", "agents") - if (version := _package_version(name)) is not None - } - return { - "scenario": SCENARIO, - "run_label": RUN_LABEL, - "mode": MODE, - "cwd": os.getcwd(), - "script_path": str(Path(__file__).resolve()), - "python_executable": sys.executable, - 
"python_version": sys.version.split()[0], - "platform": platform.platform(), - "git_commit": _git_value("rev-parse", "HEAD"), - "git_branch": _git_value("rev-parse", "--abbrev-ref", "HEAD"), - "uv_path": shutil.which("uv"), - "package_versions": package_versions, - "approved_env_vars": approved, - "output_dir": str(_output_dir()) if _output_dir() else None, - } - - -def start_case(case_id: str, *, mode: str = MODE, note: str | None = None) -> None: - emit("case_start", case_id=case_id, mode=mode, note=note) - - -def record_case_result( - case_id: str, - observation_summary: str, - result_flag: str, - *, - mode: str = MODE, - is_warmup: bool = False, - total_latency_s: float | None = None, - first_token_latency_s: float | None = None, - metrics: dict[str, object] | None = None, - error: str | None = None, -) -> None: - payload: dict[str, object] = { - "case_id": case_id, - "mode": mode, - "is_warmup": is_warmup, - "observation_summary": observation_summary, - "result_flag": result_flag, - "metrics": metrics or {}, - "error": error, - } - if total_latency_s is not None: - payload["total_latency_s"] = total_latency_s - if first_token_latency_s is not None: - payload["first_token_latency_s"] = first_token_latency_s - RESULTS.append(payload) - emit("case_result", **payload) - - -def summarize_results() -> dict[str, object]: - by_case: defaultdict[str, list[dict[str, object]]] = defaultdict(list) - for result in RESULTS: - by_case[str(result["case_id"])].append(result) - - summary_cases: dict[str, object] = {} - for case_id, items in by_case.items(): - measured = [item for item in items if not bool(item.get("is_warmup"))] - latencies = [ - float(item["total_latency_s"]) - for item in measured - if item.get("total_latency_s") is not None - ] - first_token_latencies = [ - float(item["first_token_latency_s"]) - for item in measured - if item.get("first_token_latency_s") is not None - ] - result_flags = Counter(str(item["result_flag"]) for item in measured or items) - 
observations = [str(item["observation_summary"]) for item in (measured or items)[:3]] - summary_cases[case_id] = { - "mode": str(items[-1]["mode"]), - "runs": len(measured), - "warmups": len(items) - len(measured), - "result_flags": dict(result_flags), - "median_total_latency_s": (statistics.median(latencies) if latencies else None), - "mean_total_latency_s": statistics.mean(latencies) if latencies else None, - "median_first_token_latency_s": ( - statistics.median(first_token_latencies) if first_token_latencies else None - ), - "observations": observations, - } - - return { - "scenario": SCENARIO, - "run_label": RUN_LABEL, - "mode": MODE, - "result_count": len(RESULTS), - "cases": summary_cases, - "result_flags": dict(Counter(str(item["result_flag"]) for item in RESULTS)), - } - - -def finalize(exit_code: int) -> None: - metadata_payload = { - "exit_code": exit_code, - "runtime_context": runtime_context(), - } - summary_payload = summarize_results() - emit("summary", metadata=metadata_payload, summary=summary_payload) - - output_dir = _output_dir() - if not output_dir: - return - - metadata_path = output_dir / "metadata.json" - results_path = output_dir / "results.json" - summary_path = output_dir / "summary.json" - _write_json(metadata_path, metadata_payload) - _write_json(results_path, RESULTS) - _write_json(summary_path, summary_payload) - emit( - "artifact_paths", - metadata_path=str(metadata_path), - results_path=str(results_path), - summary_path=str(summary_path), - ) - - -def main() -> int: - case_id = os.getenv("PROBE_CASE_ID", f"case-{uuid.uuid4().hex[:8]}") - emit("banner", context=runtime_context()) - start_case(case_id) - - # Replace this block with the narrow runtime question you want to test. 
- observation = "replace-me" - result_flag = "expected" - - record_case_result( - case_id=case_id, - observation_summary=observation, - result_flag=result_flag, - ) - finalize(exit_code=0) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/.agents/skills/test-coverage-improver/SKILL.md b/.agents/skills/test-coverage-improver/SKILL.md deleted file mode 100644 index 2dff569bd5..0000000000 --- a/.agents/skills/test-coverage-improver/SKILL.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -name: test-coverage-improver -description: 'Improve test coverage in the OpenAI Agents Python repository: run `make coverage`, inspect coverage artifacts, identify low-coverage files, propose high-impact tests, and confirm with the user before writing tests.' ---- - -# Test Coverage Improver - -## Overview - -Use this skill whenever coverage needs assessment or improvement (coverage regressions, failing thresholds, or user requests for stronger tests). It runs the coverage suite, analyzes results, highlights the biggest gaps, and prepares test additions while confirming with the user before changing code. - -## Quick Start - -1. From the repo root run `make coverage` to regenerate `.coverage` data and `coverage.xml`. -2. Collect artifacts: `.coverage` and `coverage.xml`, plus the console output from `coverage report -m` for drill-downs. -3. Summarize coverage: total percentages, lowest files, and uncovered lines/paths. -4. Draft test ideas per file: scenario, behavior under test, expected outcome, and likely coverage gain. -5. Ask the user for approval to implement the proposed tests; pause until they agree. -6. After approval, write the tests in `tests/`, rerun `make coverage`, and then run `$code-change-verification` before marking work complete. - -## Workflow Details - -- **Run coverage**: Execute `make coverage` at repo root. Avoid watch flags and keep prior coverage artifacts only if comparing trends. 
-- **Parse summaries efficiently**: - - Prefer the console output from `coverage report -m` for file-level totals; fall back to `coverage.xml` for tooling or spreadsheets. - - Use `uv run coverage html` to generate `htmlcov/index.html` if you need an interactive drill-down. -- **Prioritize targets**: - - Public APIs or shared utilities in `src/agents/` before examples or docs. - - Files with low statement coverage or newly added code at 0%. - - Recent bug fixes or risky code paths (error handling, retries, timeouts, concurrency). -- **Design impactful tests**: - - Hit uncovered paths: error cases, boundary inputs, optional flags, and cancellation/timeouts. - - Cover combinational logic rather than trivial happy paths. - - Place tests under `tests/` and avoid flaky async timing. -- **Coordinate with the user**: Present a numbered, concise list of proposed test additions and expected coverage gains. Ask explicitly before editing code or fixtures. -- **After implementation**: Rerun coverage, report the updated summary, and note any remaining low-coverage areas. - -## Notes - -- Keep any added comments or code in English. -- Do not create `scripts/`, `references/`, or `assets/` unless needed later. -- If coverage artifacts are missing or stale, rerun `make coverage` instead of guessing. diff --git a/.agents/skills/test-coverage-improver/agents/openai.yaml b/.agents/skills/test-coverage-improver/agents/openai.yaml deleted file mode 100644 index d512de45d8..0000000000 --- a/.agents/skills/test-coverage-improver/agents/openai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -interface: - display_name: "Test Coverage Improver" - short_description: "Analyze coverage gaps and propose high-impact tests" - default_prompt: "Use $test-coverage-improver to analyze coverage gaps, propose high-impact tests, and update coverage after approval." 
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 70f648cc5b..b24179a47d 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -34,3 +34,8 @@ Upstream sync is managed via Ai-road-4-You/fork-sync. - Keep dependencies updated via Dependabot - No hardcoded secrets — use GitHub Secrets or environment variables - Follow OWASP Top 10 security practices + +## AgentHub Integration +- Skills: `.agents/skills/` in this repo links to shared AgentHub skills +- 14 shared agents available (api, architect, cli, deploy, developer, docker, docs, orchestrator, performance, refactor, reviewer, security, tester, troubleshoot) +- MCP: 12 servers (GitHub, Supabase, Playwright, MongoDB, Notion, HuggingFace, etc.) diff --git a/CLAUDE.md b/CLAUDE.md index 5e01a1c3d5..ccb37cef25 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1 +1,7 @@ -Read the AGENTS.md file for instructions. \ No newline at end of file +Read the AGENTS.md file for instructions. +## AgentHub +- Central hub: `~/AgentHub/` +- Skills: `.agents/skills/` (symlinked to AgentHub shared skills) +- MCP: 12 servers synced across all agents +- Agents: 14 shared agents available +- Hooks: Safety, notification, and logging hooks